Merge latest distributed rendering changes from GHE master-cluster into master-cluster on blender.org #1

Merged
William Leeson merged 13 commits from master-cluster_sync_from_ghe_patch into master-cluster 2023-03-31 17:26:02 +02:00
73 changed files with 7796 additions and 731 deletions
Showing only changes of commit fd538b52b6

View File

@@ -17,7 +17,11 @@ endif()
 add_subdirectory(rangetree)
 add_subdirectory(wcwidth)
-add_subdirectory(perceptualdiff)
+#FRL_CLR_BEGIN
+if(UNIX AND NOT APPLE)
+  add_subdirectory(perceptualdiff)
+endif()
+#FRL_CLR_END
 if(WITH_BULLET)
   if(NOT WITH_SYSTEM_BULLET)

Binary file not shown.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -0,0 +1,436 @@
/*
* This copyright notice applies to this header file only:
*
* Copyright (c) 2010-2021 NVIDIA Corporation
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the software, and to permit persons to whom the
* software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/********************************************************************************************************************/
//! \file nvcuvid.h
//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
//! \date 2015-2020
//! This file contains the interface constants, structure definitions and function prototypes.
/********************************************************************************************************************/
#if !defined(__NVCUVID_H__)
#define __NVCUVID_H__
#include "cuviddec.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/***********************************************/
//!
//! High-level helper APIs for video sources
//!
/***********************************************/
typedef void *CUvideosource;
typedef void *CUvideoparser;
typedef long long CUvideotimestamp;
/************************************************************************/
//! \enum cudaVideoState
//! Video source state enums
//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
/************************************************************************/
typedef enum {
cudaVideoState_Error = -1, /**< Error state (invalid source) */
cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream) */
cudaVideoState_Started = 1 /**< Source is running and delivering data */
} cudaVideoState;
/************************************************************************/
//! \enum cudaAudioCodec
//! Audio compression enums
//! Used in CUAUDIOFORMAT structure
/************************************************************************/
typedef enum {
cudaAudioCodec_MPEG1=0, /**< MPEG-1 Audio */
cudaAudioCodec_MPEG2, /**< MPEG-2 Audio */
cudaAudioCodec_MP3, /**< MPEG-1 Layer III Audio */
cudaAudioCodec_AC3, /**< Dolby Digital (AC3) Audio */
cudaAudioCodec_LPCM, /**< PCM Audio */
cudaAudioCodec_AAC, /**< AAC Audio */
} cudaAudioCodec;
/************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMAT
//! Video format
//! Used in cuvidGetSourceVideoFormat API
/************************************************************************************************/
typedef struct
{
cudaVideoCodec codec; /**< OUT: Compression format */
/**
* OUT: frame rate = numerator / denominator (for example: 30000/1001)
*/
struct {
/**< OUT: frame rate numerator (0 = unspecified or variable frame rate) */
unsigned int numerator;
/**< OUT: frame rate denominator (0 = unspecified or variable frame rate) */
unsigned int denominator;
} frame_rate;
unsigned char progressive_sequence; /**< OUT: 0=interlaced, 1=progressive */
unsigned char bit_depth_luma_minus8; /**< OUT: high bit depth luma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
unsigned char min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be allocated for correct
decoding. The client can send this value in ulNumDecodeSurfaces
(in CUVIDDECODECREATEINFO structure).
This guarantees correct functionality and optimal video memory
usage but not necessarily the best performance, which depends on
the design of the overall application. The optimal number of
decode surfaces (in terms of performance and memory utilization)
should be decided by experimentation for each application, but it
cannot go below min_num_decode_surfaces.
If this value is used for ulNumDecodeSurfaces then it must be
returned to parser during sequence callback. */
unsigned int coded_width; /**< OUT: coded frame width in pixels */
unsigned int coded_height; /**< OUT: coded frame height in pixels */
/**
* area of the frame that should be displayed
* typical example:
* coded_width = 1920, coded_height = 1088
* display_area = { 0,0,1920,1080 }
*/
struct {
int left; /**< OUT: left position of display rect */
int top; /**< OUT: top position of display rect */
int right; /**< OUT: right position of display rect */
int bottom; /**< OUT: bottom position of display rect */
} display_area;
cudaVideoChromaFormat chroma_format; /**< OUT: Chroma format */
unsigned int bitrate; /**< OUT: video bitrate (bps, 0=unknown) */
/**
* OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
*/
struct {
int x;
int y;
} display_aspect_ratio;
/**
* Video Signal Description
* Refer section E.2.1 (VUI parameters semantics) of H264 spec file
*/
struct {
unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified */
unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */
unsigned char reserved_zero_bits : 4; /**< Reserved bits */
unsigned char color_primaries; /**< OUT: chromaticity coordinates of source primaries */
unsigned char transfer_characteristics; /**< OUT: opto-electronic transfer characteristic of the source picture */
unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB primaries */
} video_signal_description;
unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX) */
} CUVIDEOFORMAT;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDOPERATINGPOINTINFO
//! Operating point information of scalable bitstream
/****************************************************************/
typedef struct
{
cudaVideoCodec codec;
union
{
struct
{
unsigned char operating_points_cnt;
unsigned char reserved24_bits[3];
unsigned short operating_points_idc[32];
} av1;
unsigned char CodecReserved[1024];
};
} CUVIDOPERATINGPOINTINFO;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDAV1SEQHDR
//! AV1 specific sequence header information
/****************************************************************/
typedef struct {
unsigned int max_width;
unsigned int max_height;
unsigned char reserved[1016];
} CUVIDAV1SEQHDR;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMATEX
//! Video format including raw sequence header information
//! Used in cuvidGetSourceVideoFormat API
/****************************************************************/
typedef struct
{
CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */
union {
CUVIDAV1SEQHDR av1;
unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data */
};
} CUVIDEOFORMATEX;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUAUDIOFORMAT
//! Audio formats
//! Used in cuvidGetSourceAudioFormat API
/****************************************************************/
typedef struct
{
cudaAudioCodec codec; /**< OUT: Compression format */
unsigned int channels; /**< OUT: number of audio channels */
unsigned int samplespersec; /**< OUT: sampling frequency */
unsigned int bitrate; /**< OUT: For uncompressed, can also be used to determine bits per sample */
unsigned int reserved1; /**< Reserved for future use */
unsigned int reserved2; /**< Reserved for future use */
} CUAUDIOFORMAT;
/***************************************************************/
//! \enum CUvideopacketflags
//! Data packet flags
//! Used in CUVIDSOURCEDATAPACKET structure
/***************************************************************/
typedef enum {
CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream */
CUVID_PKT_TIMESTAMP = 0x02, /**< Timestamp is valid */
CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */
CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame or one field */
CUVID_PKT_NOTIFY_EOS = 0x10, /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an additional (dummy)
display callback will be invoked with null value of CUVIDPARSERDISPINFO which
should be interpreted as end of the stream. */
} CUvideopacketflags;
/*****************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEDATAPACKET
//! Data Packet
//! Used in cuvidParseVideoData API
//! IN for cuvidParseVideoData
/*****************************************************************************/
typedef struct _CUVIDSOURCEDATAPACKET
{
unsigned long flags; /**< IN: Combination of CUVID_PKT_XXX flags */
unsigned long payload_size; /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */
const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */
CUvideotimestamp timestamp; /**< IN: Presentation time stamp (10MHz clock), only valid if
CUVID_PKT_TIMESTAMP flag is set */
} CUVIDSOURCEDATAPACKET;
// Callback for packet delivery
typedef int (CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
/**************************************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEPARAMS
//! Describes parameters needed in cuvidCreateVideoSource API
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/**************************************************************************************************************************/
typedef struct _CUVIDSOURCEPARAMS
{
unsigned int ulClockRate; /**< IN: Time stamp units in Hz (0=default=10000000Hz) */
unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
unsigned int uReserved1[6]; /**< Reserved for future use - set to zero */
void *pUserData; /**< IN: User private data passed in to the data handlers */
PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets */
PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets. */
void *pvReserved2[8]; /**< Reserved for future use - set to NULL */
} CUVIDSOURCEPARAMS;
/**********************************************/
//! \ingroup ENUMS
//! \enum CUvideosourceformat_flags
//! CUvideosourceformat_flags
//! Used in cuvidGetSourceVideoFormat API
/**********************************************/
typedef enum {
CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */
} CUvideosourceformat_flags;
#if !defined(__APPLE__)
/***************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create CUvideosource object. CUvideosource spawns demultiplexer thread that provides two callbacks:
//! pfnVideoDataHandler() and pfnAudioDataHandler()
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/***************************************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
/***************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create video source
/***************************************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
/********************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
//! Destroy video source
/********************************************************************/
CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
/******************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state)
//! Set video source state to:
//! cudaVideoState_Started - to signal the source to run and deliver data
//! cudaVideoState_Stopped - to stop the source from delivering the data
//! cudaVideoState_Error - invalid source
/******************************************************************************************/
CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
/******************************************************************************************/
//! \ingroup FUNCTS
//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
//! Get video source state
//! Returns:
//! cudaVideoState_Started - if Source is running and delivering data
//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream
//! cudaVideoState_Error - if Source is in error state
/******************************************************************************************/
cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj);
/******************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags)
//! Gets video source format in pvidfmt, flags is set to combination of CUvideosourceformat_flags as per requirement
/******************************************************************************************************************/
CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
/**************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags)
//! Get audio source format
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/**************************************************************************************************************************/
CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
#endif
/**********************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERDISPINFO
//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
/**********************************************************************************/
typedef struct _CUVIDPARSERDISPINFO
{
int picture_index; /**< OUT: Index of the current picture */
int progressive_frame; /**< OUT: 1 if progressive frame; 0 otherwise */
int top_field_first; /**< OUT: 1 if top field is displayed first; 0 otherwise */
int repeat_first_field; /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling,
-1=unpaired field) */
CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */
} CUVIDPARSERDISPINFO;
/***********************************************************************************************************************/
//! Parser callbacks
//! The parser will call these synchronously from within cuvidParseVideoData(), whenever there is sequence change or a picture
//! is ready to be decoded and/or displayed. First argument in functions is "void *pUserData" member of structure CUVIDSOURCEPARAMS
//! Return values from these callbacks are interpreted as below. If the callbacks return failure, it will be propagated by
//! cuvidParseVideoData() to the application.
//! Parser picks default operating point as 0 and outputAllLayers flag as 0 if PFNVIDOPPOINTCALLBACK is not set or return value is
//! -1 or invalid operating point.
//! PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces
//! while creating parser)
//! PFNVIDDECODECALLBACK : 0: fail, >=1: succeeded
//! PFNVIDDISPLAYCALLBACK : 0: fail, >=1: succeeded
//! PFNVIDOPPOINTCALLBACK : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint, bit 10-10: outputAllLayers, bit 11-30: reserved)
/***********************************************************************************************************************/
typedef int (CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
typedef int (CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
typedef int (CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
typedef int (CUDAAPI *PFNVIDOPPOINTCALLBACK)(void *, CUVIDOPERATINGPOINTINFO*);
/**************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERPARAMS
//! Used in cuvidCreateVideoParser API
/**************************************/
typedef struct _CUVIDPARSERPARAMS
{
cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */
unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these) */
unsigned int ulClockRate; /**< IN: Timestamp units in Hz (0=default=10000000Hz) */
unsigned int ulErrorThreshold; /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always
IN: call pfnDecodePicture even if picture bitstream is fully corrupted) */
unsigned int ulMaxDisplayDelay; /**< IN: Max display queue delay (improves pipelining of decode with display)
0=no delay (recommended values: 2..4) */
unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
unsigned int uReserved1[4]; /**< IN: Reserved for future use - set to 0 */
void *pUserData; /**< IN: User data for callbacks */
PFNVIDSEQUENCECALLBACK pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */
PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */
PFNVIDDISPLAYCALLBACK pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order) */
PFNVIDOPPOINTCALLBACK pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get operating point of a AV1
scalable bitstream */
void *pvReserved2[6]; /**< Reserved for future use - set to NULL */
CUVIDEOFORMATEX *pExtVideoInfo; /**< IN: [Optional] sequence header data from system layer */
} CUVIDPARSERPARAMS;
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams)
//! Create video parser object and initialize
/************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket)
//! Parse the video data from source data packet in pPacket
//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and
//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking of HW decoding
//! calls back pfnSequenceCallback with CUVIDEOFORMAT data for initial sequence header or when
//! the decoder encounters a video format change
//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a video frame
/************************************************************************************************/
CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
//! Destroy the video parser
/************************************************************************************************/
CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
/**********************************************************************************************/
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif // __NVCUVID_H__
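
The parser-side flow documented above (sequence, decode, and display callbacks invoked synchronously from cuvidParseVideoData) can be hard to piece together from the prototypes alone. The following minimal sketch is not part of this diff; it only wires up the API declared in this header, assumes cudaVideoCodec_H264 and CUVIDPICPARAMS from the companion cuviddec.h, and omits the actual decoder a real client would drive from these callbacks:

// Minimal sketch of driving the NVCUVID parser API declared above.
static int CUDAAPI HandleSequence(void *user, CUVIDEOFORMAT *fmt) {
  // Returning a value > 1 overrides the parser's DPB size; the header above
  // requires at least fmt->min_num_decode_surfaces.
  return fmt->min_num_decode_surfaces;
}
static int CUDAAPI HandleDecode(void *user, CUVIDPICPARAMS *pic) { return 1; }      // >=1: success
static int CUDAAPI HandleDisplay(void *user, CUVIDPARSERDISPINFO *disp) { return 1; } // >=1: success

void ParseElementaryStream(const unsigned char *data, unsigned long size) {
  CUVIDPARSERPARAMS params = {};
  params.CodecType = cudaVideoCodec_H264;
  params.ulMaxNumDecodeSurfaces = 1;  // raised by HandleSequence's return value
  params.pfnSequenceCallback = HandleSequence;
  params.pfnDecodePicture = HandleDecode;
  params.pfnDisplayPicture = HandleDisplay;
  CUvideoparser parser = nullptr;
  cuvidCreateVideoParser(&parser, &params);

  CUVIDSOURCEDATAPACKET packet = {};
  packet.payload = data;
  packet.payload_size = size;
  cuvidParseVideoData(parser, &packet);  // callbacks fire synchronously in here

  packet = {};
  packet.flags = CUVID_PKT_ENDOFSTREAM;  // flush; payload may be NULL
  cuvidParseVideoData(parser, &packet);
  cuvidDestroyVideoParser(parser);
}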

Binary file not shown.

View File

@@ -0,0 +1,167 @@
This SDK includes portions of FFMPEG, under the following license:
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this license
document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this license
document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of this
License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,9 @@
Reality Labs notes.
This is a shortened set of the NVIDIA Video Codec SDK.
The Doc, Lib and Samples folders are removed since we do not use them anyway.
Only the Interface folder and the readme and license files are left.

Binary file not shown.


Binary file not shown.


Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN extern/perceptualdiff/test/cam_mb.tif vendored Normal file

Binary file not shown.

Binary file not shown.

BIN extern/perceptualdiff/test/fish1.png vendored Normal file

Binary file not shown.


BIN extern/perceptualdiff/test/fish2.png vendored Normal file

Binary file not shown.


extern/perceptualdiff/test/run_tests.sh vendored Executable file (49 lines)
View File

@@ -0,0 +1,49 @@
#!/bin/bash
# Script to run pdiff against a set of image file pairs, and check that the
# PASS or FAIL status is as expected.

#------------------------------------------------------------------------------
# Image files and expected perceptualdiff PASS/FAIL status. Line format is
# (PASS|FAIL) image1.(tif|png) image2.(tif|png)
#
# Edit the following lines to add additional tests.
function all_tests {
    cat <<EOF
FAIL Bug1102605_ref.tif Bug1102605.tif
PASS Bug1471457_ref.tif Bug1471457.tif
PASS cam_mb_ref.tif cam_mb.tif
FAIL fish2.png fish1.png
EOF
}

# Modify pdiffBinary to point to your compiled pdiff executable if desired.
pdiffBinary=../perceptualdiff

#------------------------------------------------------------------------------
totalTests=0
numTestsFailed=0

# Run all tests.
while read expectedResult image1 image2 ; do
    if $pdiffBinary -verbose $image1 $image2 | grep -q "^$expectedResult" ; then
        totalTests=$(($totalTests+1))
    else
        numTestsFailed=$(($numTestsFailed+1))
        echo "Regression failure: expected $expectedResult for \"$pdiffBinary $image1 $image2\"" >&2
    fi
done <<EOF
$(all_tests)
EOF
# (the above with the EOF's is a stupid bash trick to stop while from running
# in a subshell)

# Give some diagnostics:
if [[ $numTestsFailed == 0 ]] ; then
    echo "*** all $totalTests tests passed"
else
    echo "*** $numTestsFailed failed tests of $totalTests"
fi

exit $numTestsFailed
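
Note that the script exits with the number of failed tests, so a zero exit status means every image pair matched its expected PASS/FAIL result; this makes it easy to call directly from CI or a parent build script.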

View File

@@ -276,7 +276,12 @@ if(WITH_CYCLES_DEVICE_METAL)
 endif()

 if(WITH_CYCLES_DEVICE_ONEAPI)
-  add_definitions(-DWITH_ONEAPI)
+  # FRL_CLR_BEGIN
+  # TODO: T146077207
+  message(STATUS "Temporarily turning WITH_ONEAPI off on the monaco-cluster branch because it does not compile there. TODO: fix and put it back. Task: T146077207")
+  # add_definitions(-DWITH_ONEAPI)
+  # FRL_CLR_END
 endif()

 if(WITH_CYCLES_EMBREE)

View File

@@ -8,94 +8,64 @@ set(INC
   ../../../../extern # for flatbuffers
   ${TurboJPEG_INCLUDE_DIRS}
   ${OPENIMAGEIO_INCLUDE_DIR}
-# ${CUDA_TOOLKIT_INCLUDE}
   ../../cluster_rendering/libcluster
   ../../cluster_rendering/libcluster/compression # for nv_encoder
   ../../cluster_rendering/libnetwork
   ${NVCODEC_PUBLIC_INTERFACE_DIR}
 )
+if(WITH_WEBRTC)
+  list(APPEND INC ../../cluster_rendering/libstream/include)
+endif(WITH_WEBRTC)

 set(INC_SYS
 )

-if(NOT DEFINED NVENCODEAPI_LIB)
-  if(WIN32)
-    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-      find_library(NVENCODEAPI_LIB NAMES nvencodeapi HINTS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../lib/Video_Codec_SDK_11.1.5/Lib/" PATH_SUFFIXES x64/)
-    else()
-      find_library(NVENCODEAPI_LIB NAMES nvencodeapi HINTS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../lib/Video_Codec_SDK_11.1.5/Lib/" PATH_SUFFIXES Win32/)
-    endif()
-  else()
-    find_library(NVENCODEAPI_LIB NAMES nvidia-encode HINTS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../lib/Video_Codec_SDK_11.1.5/Lib/" PATH_SUFFIXES linux/stubs/x86_64/)
-    if(NOT DEFINED NVENCODEAPI_LIB)
-      find_library(NVENCODEAPI_LIB NAMES nvidia-encode HINTS "/Lib64/")
-    endif()
-  endif()
-  if(NVENCODEAPI_LIB)
-    message(STATUS "Found NVidia encoding library:${NVENCODEAPI_LIB}")
-  else()
-    message(WARNING "NVidia encoding library not found.")
-  endif()
-else()
-  message(STATUS "Using existing NVENCODEAPI_LIB, skipping library search")
-endif()

 find_package(Boost 1.48 COMPONENTS program_options serialization thread filesystem)

+add_definitions(-DHEADLESS_CLIENT)

 set(LIBRARIES
-  cycles_libnetwork
   cycles_libcluster
   cycles_libnetwork
   ${TurboJPEG_LIBRARIES}
-  ${OPENCOLORIO_LIBRARIES}
-  ${OPENIMAGEIO_LIBRARY}
-  ${OPENIMAGEIO_UTIL_LIBRARY}
-  ${PNG_LIBRARIES}
   ${JPEG_LIBRARIES}
-  ${TIFF_LIBRARY}
-  ${OPENEXR_LIBRARIES}
   ${OPENJPEG_LIBRARIES}
-  ${ZLIB_LIBRARIES}
-  ${PUGIXML_LIBRARY}
-  ${WEBP_LIBRARIES}
   ${Boost_LIBRARIES}
   ${GLOG_LIBRARIES}
-  bf_intern_libc_compat
 )

-if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
-  list(APPEND LIBRARIES ${NVENCODEAPI_LIB}) # For NVCODEC
+if(WITH_CYCLES_DEVICE_CUDA AND CUDA_INCLUDE_DIRS AND UNIX)
+  if(NOT DEFINED NVENCODEAPI_LIB)
+    find_library(NVENCODEAPI_LIB nvidia-encode)
+    find_library(NVCUVID_LIB nvcuvid HINTS "/lib64/") # the same folder on dev laptop and docker image
+    if(NOT NVENCODEAPI_LIB)
+      message(ERROR "NVidia encoding library not found.")
+    endif()
+  else()
+    message(STATUS "Using pre-defined NVENCODEAPI_LIB: ${NVENCODEAPI_LIB}")
+  endif()
+  list(APPEND LIBRARIES ${NVENCODEAPI_LIB})
+  if(WITH_CUDA_DYNLOAD AND NOT CUDA_CUDA_LIBRARY)
+    list(APPEND INC "../../../../extern/cuew/include/")
+    list(APPEND LIBRARIES extern_cuew)
+    add_definitions(-DWITH_CUDA_DYNLOAD)
+  else()
+    list(APPEND INC ${CUDA_TOOLKIT_INCLUDE})
+    list(APPEND LIBRARIES ${CUDA_CUDA_LIBRARY})
+    remove_definitions(-DWITH_CUDA_DYNLOAD)
+  endif()
 endif()

-# Only use cuew if CUDA_CUDA_LIBRARY not specified
-if(WITH_CUDA_DYNLOAD AND NOT CUDA_CUDA_LIBRARY)
-  list(APPEND INC
-    ../../../../extern/cuew/include
-  )
-  list(APPEND LIBRARIES
-    extern_cuew
-  )
-  add_definitions(-DWITH_CUDA_DYNLOAD)
-else()
-  list(APPEND INC
-    ${CUDA_TOOLKIT_INCLUDE}
-  )
-  list(APPEND LIBRARIES
-    ${CUDA_CUDA_LIBRARY}
-  )
-  remove_definitions(-DWITH_CUDA_DYNLOAD)
-endif()
-
-list(APPEND LIBRARIES ${CMAKE_DL_LIBS})
-
-link_directories(${BOOST_LIBPATH})

 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})

 set(CMAKE_EXE_LINKER_FLAGS "-std=c++11 -pthread")

-file(GLOB SRC *.cpp)
+file(GLOB SRC *.cpp *.cu)
+# Normally we should just link against libcluster and not compile against source files in libcluster.
+# However, libcluster pulls in tons of dependencies, including server/master side dependencies,
+# most of which we do not need, which makes this light client not light at all.
+# We may refactor libcluster to split it into client and master/server parts.
+# Since this light client only needs the functionality of the classes listed below,
+# we work around the dependency issue by compiling only against the following source files:
+list(APPEND SRC ../../cluster_rendering/libcluster/rpc_blender_protocol.cpp)
+list(APPEND SRC ../../cluster_rendering/libcluster/net_camera.cpp)

 add_executable(${TARGET} ${SRC})
 target_link_libraries(${TARGET} ${LIBRARIES})

View File

@@ -28,8 +28,11 @@ const string Config::SAVE_IMAGES_PATH = "save_images_path";
 const string Config::SAVE_VIDEO_STREAM_PATH = "save_video_stream_path";
 const string Config::USER_EVENTS_PATH = "user_events_path";
 const string Config::MASTER_IMAGE_COLOR_FORMAT = "master_image_color_format";
+const string Config::MASTER_IMAGE_COMPRESSOR = "master_image_compressor";
 const string Config::COLOR_FORMAT_LINEAR = "linear";
 const string Config::COLOR_FORMAT_SRGB = "srgb";
+const string Config::IMAGE_COMPRESSOR_JPEG = "JPEG";
+const string Config::IMAGE_COMPRESSOR_NVENCODER = "NVENCODER";

 Config::Config() :
@@ -55,6 +58,12 @@ bool Config::init(int argc, char* argv[]) {
       COLOR_FORMAT_SRGB + ". " +
       "Default is: " + COLOR_FORMAT_SRGB;

+  const std::string supported_image_compressors =
+      IMAGE_COMPRESSOR_JPEG + ", " +
+      IMAGE_COMPRESSOR_NVENCODER + ". " +
+      "Default is: " + IMAGE_COMPRESSOR_JPEG;
+
   namespace po = boost::program_options;
   po::options_description options("Allowed options");
   options.add_options()
@@ -78,6 +87,10 @@ if not provided, client uses the resolution net camera object was saved with")
     ((USER_EVENTS_PATH + ",u").c_str(), po::value<string>(), "path to a file with user events")
     ((MASTER_IMAGE_COLOR_FORMAT + ",i").c_str(), po::value<string>(),
         ("master image color format. Options: " + supported_color_formats).c_str())
+    ((MASTER_IMAGE_COMPRESSOR + ",i").c_str(), po::value<string>(),
+        ("master image compressor. Options: " + supported_image_compressors).c_str())
     ((DENOISER + ",d").c_str(), po::value<string>(), denoiser_help.c_str());

   po::variables_map vm;
   po::store(po::parse_command_line(argc, argv, options), vm);
@@ -191,6 +204,19 @@ if not provided, client uses the resolution net camera object was saved with")
     }
   }

+  if (vm.count(MASTER_IMAGE_COMPRESSOR)) {
+    string master_image_compressor_str = vm[MASTER_IMAGE_COMPRESSOR].as<string>();
+    if(master_image_compressor_str == IMAGE_COMPRESSOR_JPEG) {
+      master_image_compressor = ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG;
+    } else if(master_image_compressor_str == IMAGE_COMPRESSOR_NVENCODER) {
+      master_image_compressor = ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NVENCODER;
+    } else {
+      VLOG(1) << "FATAL. Requested image compressor is not supported. Requested: " << master_image_compressor_str <<
+          " Supported: " << supported_image_compressors;
+      return false;
+    }
+  }
+
   return true;
 }
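
For reference (not part of the diff): with this change the client accepts, e.g., --master_image_compressor NVENCODER on its command line, alongside the existing --master_image_color_format option; JPEG remains the default when the option is omitted.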

View File

@@ -35,8 +35,11 @@ class Config {
   static const string SAVE_VIDEO_STREAM_PATH;
   static const string USER_EVENTS_PATH;
   static const string MASTER_IMAGE_COLOR_FORMAT;
+  static const string MASTER_IMAGE_COMPRESSOR;
   static const string COLOR_FORMAT_LINEAR;
   static const string COLOR_FORMAT_SRGB;
+  static const string IMAGE_COMPRESSOR_JPEG;
+  static const string IMAGE_COMPRESSOR_NVENCODER;

 public:
   size_t start_frame_id = 0;
@@ -60,6 +63,8 @@ public:
   ClusterSessionParams::MasterDenoiser master_denoiser;
   ClusterSessionParams::MasterImageColorFormat master_image_color_format =
       ClusterSessionParams::MasterImageColorFormat::MASTER_IMAGE_COLOR_FORMAT_SRGB;
+  ClusterSessionParams::MasterImageCompressor master_image_compressor =
+      ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG;

   Config();
   ~Config();

View File

@ -8,15 +8,23 @@
#include "camera_provider.h" #include "camera_provider.h"
#include "cluster_session_params.h" #include "cluster_session_params.h"
#ifdef WITH_OPTIX #ifdef WITH_CUDA
#include "cuda_context.h" #include "cuda_context_provider.h"
#endif #endif
#include "image_io_util.h"
#include "light_cluster_rendering_client.h" #include "light_cluster_rendering_client.h"
#include "modify_object_message.h" #include "modify_object_message.h"
#include "net_camera.h" #include "net_camera.h"
#include "user_events_provider.h" #include "user_events_provider.h"
#ifdef WITH_CUDA
using cgr_libcluster::CudaContextProvider;
using cgr_libcluster::CUDAContextScope;
#endif
using cgr_libcluster::ImageIOUtil;
using cluster_rendering_net_lib::Retry; using cluster_rendering_net_lib::Retry;
using OpenImageIO_v2_4::ImageOutput;
using OpenImageIO_v2_4::TypeDesc;
using namespace std::chrono; using namespace std::chrono;
@ -25,6 +33,7 @@ namespace headless_light_client {
const std::string LightClusterRenderingClient::IMAGE = "Image"; const std::string LightClusterRenderingClient::IMAGE = "Image";
const std::string LightClusterRenderingClient::CAMERA = "Camera"; const std::string LightClusterRenderingClient::CAMERA = "Camera";
const std::string LightClusterRenderingClient::JPG = ".jpg"; const std::string LightClusterRenderingClient::JPG = ".jpg";
const std::string LightClusterRenderingClient::PNG = ".png";
const double LightClusterRenderingClient::WAIT_LAST_FRAME_TIMEOUT_MS = 1500; const double LightClusterRenderingClient::WAIT_LAST_FRAME_TIMEOUT_MS = 1500;
LightClusterRenderingClient::LightClusterRenderingClient(const Config & config, CameraProvider & camera_provider, LightClusterRenderingClient::LightClusterRenderingClient(const Config & config, CameraProvider & camera_provider,
@ -42,16 +51,12 @@ LightClusterRenderingClient::LightClusterRenderingClient(const Config & config,
image_client_events_handler(IMAGE), image_client_events_handler(IMAGE),
netlib_camera_client(camera_client_events_handler), netlib_camera_client(camera_client_events_handler),
netlib_image_client(image_client_events_handler) netlib_image_client(image_client_events_handler)
#ifdef WITH_OPTIX
,encoded_image(INITIAL_SIZE_OF_BUFFER_FOR_ENCODED_IMAGE) ,encoded_image(INITIAL_SIZE_OF_BUFFER_FOR_ENCODED_IMAGE)
#endif
{ {
} }
LightClusterRenderingClient::~LightClusterRenderingClient() { LightClusterRenderingClient::~LightClusterRenderingClient() {
#ifdef WITH_OPTIX
video_stream_file.close(); video_stream_file.close();
#endif
} }
void LightClusterRenderingClient::run() { void LightClusterRenderingClient::run() {
@ -69,10 +74,12 @@ void LightClusterRenderingClient::run() {
camera.cam_width = config.frame_width; camera.cam_width = config.frame_width;
camera.cam_height = config.frame_height; camera.cam_height = config.frame_height;
} }
VLOG(3) << "client send camera for frame: " << camera.frame << " cam_width: " << camera.cam_width << " cam_height: " << camera.cam_height; VLOG(3) << "client send camera for frame: " << camera.frame << " cam_width: " << camera.cam_width <<
" cam_height: " << camera.cam_height;
camera.sampleCount = samples_count; camera.sampleCount = samples_count;
camera.master_denoiser = master_denoiser; camera.master_denoiser = master_denoiser;
camera.master_image_color_format = config.master_image_color_format; camera.master_image_color_format = config.master_image_color_format;
camera.master_image_compressor = config.master_image_compressor;
if(is_first_camera) { if(is_first_camera) {
camera.frame = 0; camera.frame = 0;
is_first_camera = false; is_first_camera = false;
@ -127,9 +134,9 @@ void LightClusterRenderingClient::nextCameraDelay(const int frame_id) {
} }
} }
#ifdef WITH_OPTIX #ifdef WITH_CUDA
void LightClusterRenderingClient::encodeImage(NvEncoder * nv_encoder_ptr, void LightClusterRenderingClient::encodeImage(NvEncoder * nv_encoder_ptr,
const std::vector<uint8_t> & raw_image, AutoEnlargingBuffer<uint8_t> & encoded_image_out) { const std::vector<uint8_t> & raw_image, std::vector<uint8_t> & encoded_image_out) {
auto start = high_resolution_clock::now(); auto start = high_resolution_clock::now();
if(nv_encoder_ptr->encode(raw_image.data(), encoded_image_out)) { if(nv_encoder_ptr->encode(raw_image.data(), encoded_image_out)) {
std::chrono::duration<float> duration_sec = high_resolution_clock::now() - start; std::chrono::duration<float> duration_sec = high_resolution_clock::now() - start;
@ -140,9 +147,13 @@ void LightClusterRenderingClient::encodeImage(NvEncoder * nv_encoder_ptr,
throw std::runtime_error(message); throw std::runtime_error(message);
} }
} }
#endif
void LightClusterRenderingClient::writeImageToVideoStreamFile(std::ofstream &video_stream_file, void LightClusterRenderingClient::writeImageToVideoStreamFile(std::ofstream &video_stream_file,
const AutoEnlargingBuffer<uint8_t> &encoded_image) { const std::vector<uint8_t> &encoded_image) {
if(!video_stream_file.is_open()) {
video_stream_file.open(config.save_video_stream_path, std::ios::app | std::ios::binary);
}
if(video_stream_file.is_open()) { if(video_stream_file.is_open()) {
video_stream_file.write(reinterpret_cast<const char*>(encoded_image.data()), encoded_image.size()); video_stream_file.write(reinterpret_cast<const char*>(encoded_image.data()), encoded_image.size());
VLOG(3) << "Wrote encoded image of size: " << encoded_image.size(); VLOG(3) << "Wrote encoded image of size: " << encoded_image.size();
@ -151,7 +162,7 @@ void LightClusterRenderingClient::writeImageToVideoStreamFile(std::ofstream &vid
throw std::invalid_argument(message); throw std::invalid_argument(message);
} }
} }
#endif
// Simple RGB to RGBA convertion. // Simple RGB to RGBA convertion.
// Headless client gets image in jpeg format from the master and decompress it into RGB format // Headless client gets image in jpeg format from the master and decompress it into RGB format
// This method is used for convertion from RGB to RGBA since NVEncoder expects RGBA flavor // This method is used for convertion from RGB to RGBA since NVEncoder expects RGBA flavor
@ -181,6 +192,8 @@ void LightClusterRenderingClient::imageChannelLoop() {
VLOG(3) << "Image channel received from master buffer size: " << buf_size; VLOG(3) << "Image channel received from master buffer size: " << buf_size;
NetCamera net_camera; NetCamera net_camera;
netlib_image_client.receive(&net_camera, sizeof(NetCamera)); netlib_image_client.receive(&net_camera, sizeof(NetCamera));
// log line below is used by get_metrics.py to calculate stats. If you change it
// please make sure get_metrics.py still works correctly. Update if needed.
VLOG(3) << "net camera received for frame: " << net_camera.frame << " " VLOG(3) << "net camera received for frame: " << net_camera.frame << " "
<< net_camera.cam_width << " " << net_camera.cam_height; << net_camera.cam_width << " " << net_camera.cam_height;
std::unique_ptr<std::vector<uint8_t> > image_buffer_uptr(new std::vector<uint8_t>(buf_size)); std::unique_ptr<std::vector<uint8_t> > image_buffer_uptr(new std::vector<uint8_t>(buf_size));
@ -190,6 +203,7 @@ void LightClusterRenderingClient::imageChannelLoop() {
received_image_from_master = true; received_image_from_master = true;
++num_received_frames; ++num_received_frames;
outputImageIfRequested(image_buffer_uptr.get(), net_camera); outputImageIfRequested(image_buffer_uptr.get(), net_camera);
outputVideoStreamIfRequested(image_buffer_uptr.get(), net_camera);
image_cv.notify_one(); image_cv.notify_one();
} }
} else { } else {
@ -201,49 +215,88 @@ void LightClusterRenderingClient::imageChannelLoop() {
VLOG(3) << "Finished image channel thread."; VLOG(3) << "Finished image channel thread.";
} }
void LightClusterRenderingClient::outputImageIfRequested(std::vector<uint8_t> * jpeg_image, const NetCamera & net_camera) { void LightClusterRenderingClient::outputImageIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera) {
if(config.save_images_path.length() < 1 && config.save_video_stream_path.length() < 1) { if(config.save_images_path.length() < 1) {
return; // no output is requested, return; return; // no image output is requested, return;
} }
if(jpeg_image == nullptr) { if(net_camera.master_image_compressor ==
std::string message("FATAL. Cannot output an image, a pointer to the image is null"); ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG) {
VLOG(3) << message; if(image == nullptr) {
throw std::runtime_error(message); std::string message("FATAL. Cannot output an image, a pointer to the image is null");
} VLOG(3) << message;
uint8_t * flipped_jpeg = nullptr; throw std::runtime_error(message);
unsigned long flipped_jpeg_size = 0;
if(jpeg_tools_ptr == nullptr) {
jpeg_tools_ptr.reset(new JpegTools());
}
// Images that we receive from the master are upside down, we need to flip images before saving
jpeg_tools_ptr->flipJpegImage(jpeg_image->data(), jpeg_image->size(), &flipped_jpeg, flipped_jpeg_size);
if(config.save_images_path.length() > 0) {
// Saving of noisy images (which are significantly larger due to solt & papper noise)
// to /mnt/graphics_ssd may take about 30 ms what limits read image thread to about 30 fps.
// Since we do not want IO operations to affect averall system performance on hight fps
// we save images in a different thread
boost::asio::post(save_image_thread_pool, std::bind(&LightClusterRenderingClient::saveImage, this,
getImagePath(net_camera.frame, JPG), flipped_jpeg, flipped_jpeg_size));
}
if(config.save_video_stream_path.length() > 0) {
#ifdef WITH_OPTIX
if(!nv_encoder_ptr) {
// create NVEncoder once we got the first frame with width and height of the image
VLOG(3) << "Creating NvEncoder";
nv_encoder_ptr.reset(new NvEncoder(NvEncoder::BUFFER_FORMAT_FOR_IMAGES_FROM_MASTER,
CudaContext::createCudaContext(),
net_camera.cam_width, net_camera.cam_height));
video_stream_file.open(config.save_video_stream_path, std::ios::app | std::ios::binary);
} }
uint8_t * flipped_jpeg = nullptr;
unsigned long flipped_jpeg_size = 0;
if(jpeg_tools_ptr == nullptr) {
jpeg_tools_ptr.reset(new JpegTools());
}
// Images that we receive from the master are upside down, we need to flip images before saving
jpeg_tools_ptr->flipJpegImage(image->data(), image->size(), &flipped_jpeg, flipped_jpeg_size);
// Saving of noisy images (which are significantly larger due to solt & papper noise)
// to /mnt/graphics_ssd may take about 30 ms what limits read image thread to about 30 fps.
// Since we do not want IO operations to affect averall system performance on hight fps
// we save images in a different thread
boost::asio::post(save_image_thread_pool, std::bind(&LightClusterRenderingClient::saveImageAsIs, this,
getImagePath(net_camera.frame, JPG), flipped_jpeg, flipped_jpeg_size));
}
#ifdef WITH_CUDA
else if(net_camera.master_image_compressor ==
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
if(!nv_decoder_ptr) {
VLOG(3) << "Creating nvdecoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
nv_decoder_ptr.reset(new NvDecoder(cuda_context));
}
nv_decoder_ptr->decode(*image, net_camera.frame, &decoded_image);
boost::asio::post(save_image_thread_pool, std::bind(&LightClusterRenderingClient::saveImageInFormat, this,
PNG, net_camera.frame, decoded_image.data(), net_camera.cam_width, net_camera.cam_height));
}
#endif #endif
else {
VLOG(1) << "Cannot save image, unknown compression: " << net_camera.master_image_compressor;
}
}
void LightClusterRenderingClient::outputVideoStreamIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera) {
if(config.save_video_stream_path.length() < 1) {
return; // no video stream output is requested, return;
}
if(net_camera.master_image_compressor ==
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG) {
#ifdef WITH_CUDA
if(!nv_encoder_ptr) {
VLOG(3) << "Creating NvEncoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
nv_encoder_ptr.reset(new NvEncoder(NV_ENC_BUFFER_FORMAT_ABGR,
cuda_context,
net_camera.cam_width, net_camera.cam_height));
}
std::vector<uint8_t> decompressed_image_rgb(net_camera.cam_width * net_camera.cam_height * 3); std::vector<uint8_t> decompressed_image_rgb(net_camera.cam_width * net_camera.cam_height * 3);
std::vector<uint8_t> decompressed_image_rgba(net_camera.cam_width * net_camera.cam_height * 4); std::vector<uint8_t> decompressed_image_rgba(net_camera.cam_width * net_camera.cam_height * 4);
uint8_t * flipped_jpeg = nullptr;
unsigned long flipped_jpeg_size = 0;
if(jpeg_tools_ptr == nullptr) {
jpeg_tools_ptr.reset(new JpegTools());
}
// Images that we receive from the master are upside down, we need to flip images before saving
jpeg_tools_ptr->flipJpegImage(image->data(), image->size(), &flipped_jpeg, flipped_jpeg_size);
jpeg_tools_ptr->decompressJpeg(flipped_jpeg, flipped_jpeg_size, decompressed_image_rgb); jpeg_tools_ptr->decompressJpeg(flipped_jpeg, flipped_jpeg_size, decompressed_image_rgb);
rgbToRgba(decompressed_image_rgb, decompressed_image_rgba); rgbToRgba(decompressed_image_rgb, decompressed_image_rgba);
#ifdef WITH_OPTIX
encodeImage(nv_encoder_ptr.get(), decompressed_image_rgba, encoded_image); encodeImage(nv_encoder_ptr.get(), decompressed_image_rgba, encoded_image);
writeImageToVideoStreamFile(video_stream_file, encoded_image); writeImageToVideoStreamFile(video_stream_file, encoded_image);
#else
throw std::runtime_error("ERROR. Client is compiled without CUDA support so has no nvencoder and\
cannot encode received image and save video stream");
#endif #endif
} else if(net_camera.master_image_compressor ==
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
// image is already video-encoded by nvencoder, save it as is
writeImageToVideoStreamFile(video_stream_file, *image);
} else {
VLOG(1) << "Cannot save video stream, unknown compressor: " << net_camera.master_image_compressor;
} }
} }
@ -254,17 +307,24 @@ std::string LightClusterRenderingClient::getImagePath(const int frame_id, const
return path_ostream.str();
}
void LightClusterRenderingClient::saveImageAsIs(const std::string file_path, unsigned char * image, int image_size) {
VLOG(3) << "Saving image into: " << file_path;
auto start = high_resolution_clock::now();
std::ofstream wf(file_path, std::ios::out | std::ios::binary);
wf.write(reinterpret_cast<const char*>(image), image_size);
wf.close();
auto end = high_resolution_clock::now();
double time_taken = duration_cast<milliseconds>(end - start).count();
VLOG(3) << "Image saved successfully. Save image time: " << time_taken << " ms";
}
void LightClusterRenderingClient::saveImageInFormat(std::string format_extension, int frame_id, void * image,
int width, int height) {
std::string file_path = getImagePath(frame_id, format_extension);
std::unique_ptr<ImageOutput> image_output = std::unique_ptr<ImageOutput>(ImageOutput::create(file_path));
ImageIOUtil::saveFrame(file_path, TypeDesc::UCHAR, image_output.get(), image, width, height);
}
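ImageIOUtil::saveFrame and ImageOutput come from the project's OpenImageIO wrapper; the helper presumably drives OIIO roughly like the sketch below (an illustration under assumed 8-bit RGB input, not the actual implementation):

#include <memory>
#include <string>
#include <OpenImageIO/imageio.h>

// Hypothetical sketch of an OIIO save for an 8-bit RGB frame buffer.
bool saveFrameSketch(const std::string &path, const void *pixels, int width, int height) {
  using namespace OIIO;
  // The output plugin (PNG, JPEG, ...) is picked from the file extension.
  std::unique_ptr<ImageOutput> out(ImageOutput::create(path));
  if (!out) return false;
  ImageSpec spec(width, height, 3 /*channels*/, TypeDesc::UCHAR);
  if (!out->open(path, spec)) return false;
  const bool ok = out->write_image(TypeDesc::UCHAR, pixels);
  out->close();
  return ok;
}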
bool LightClusterRenderingClient::connectToMaster() {
bool connected = false;

View File

@ -11,8 +11,9 @@
#include "jpeg_tools.h" #include "jpeg_tools.h"
#ifdef WITH_OPTIX #ifdef WITH_CUDA
#include "nv_encoder.h" #include "nv_encoder.h"
#include "nv_decoder.h"
#endif #endif
#include "camera_provider.h" #include "camera_provider.h"
@ -20,14 +21,15 @@
#include "netclient.h" #include "netclient.h"
#include "netlib_event_handler.h" #include "netlib_event_handler.h"
#include "rpc_blender_protocol.h" #include "rpc_blender_protocol.h"
#include "./utils/vector_types.h" // for uchar3
namespace headless_light_client { namespace headless_light_client {
using cgr_libcluster::ClusterSessionParams; using cgr_libcluster::ClusterSessionParams;
using cgr_libcluster::RpcBlenderProtocol; using cgr_libcluster::RpcBlenderProtocol;
#ifdef WITH_OPTIX #ifdef WITH_CUDA
using cgr_libcluster::NvDecoder;
using cgr_libcluster::NvEncoder; using cgr_libcluster::NvEncoder;
using cgr_libcluster::AutoEnlargingBuffer;
#endif #endif
class CameraProvider; class CameraProvider;
@ -51,6 +53,8 @@ private:
static const std::string IMAGE;
static const std::string CAMERA;
static const std::string JPG;
static const std::string PNG;
static const size_t CUDA_DEVICE_NUM = 0;
static const size_t MAX_NUM_RETRIES = 5;
static const size_t RETRY_INTERVAL_MS = 2000;
static const size_t INITIAL_SIZE_OF_BUFFER_FOR_ENCODED_IMAGE = 50000;
@ -81,11 +85,13 @@ private:
std::condition_variable image_cv;
std::atomic<bool> received_image_from_master = false;
std::unique_ptr<JpegTools> jpeg_tools_ptr;
#ifdef WITH_CUDA
std::unique_ptr<NvEncoder> nv_encoder_ptr;
std::unique_ptr<NvDecoder> nv_decoder_ptr;
#endif
std::vector<uint8_t> encoded_image;
std::vector<cgr_libcluster::uchar3> decoded_image;
std::ofstream video_stream_file;
boost::asio::thread_pool save_image_thread_pool = boost::asio::thread_pool(4);
@ -99,13 +105,16 @@ private:
void imageChannelLoop();
void nextCameraDelay(const int frame_id);
std::string getImagePath(const int frame_id, const std::string & file_extension);
void outputVideoStreamIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera);
void outputImageIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera);
void saveImageAsIs(const std::string file_path, unsigned char * image, int image_size);
// Saves image in a format specified as a file extension
void saveImageInFormat(std::string format_extension, int frame_id, void * image, int width, int height);
void writeImageToVideoStreamFile(std::ofstream &video_stream_file,
const std::vector<uint8_t> &encoded_image);
#ifdef WITH_CUDA
void encodeImage(NvEncoder * nv_encoder_ptr, const std::vector<uint8_t> & raw_image,
std::vector<uint8_t> & encoded_image_out);
#endif
};

View File

@ -163,6 +163,7 @@ class CYCLES_RENDER_PT_render_session_mode(CyclesButtonsPanel, Panel):
netsub.enabled = net.enabled and render.render_session_mode == 'MASTER'
netsub.prop(render, "num_servers")
netport.prop(render, "master_image_color_format")
netport.prop(render, "master_image_compressor")
#Temp turned off. TODO: [pmishchuk] enable when ready
# import _cycles
# if _cycles.with_webrtc:

View File

@ -871,6 +871,13 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
// use default color format.
master_image_color_format_num = cgr_libcluster::ClusterSessionParams::DEFAULT_MASTER_IMAGE_COLOR_FORMAT;
}
int master_image_compressor_num = r.master_image_compressor();
if (master_image_compressor_num <= cgr_libcluster::ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NONE ||
master_image_compressor_num >= cgr_libcluster::ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_END) {
// Blend file has no saved value for image compressor (it was saved before we added this property)
// use the default compressor.
master_image_compressor_num = cgr_libcluster::ClusterSessionParams::DEFAULT_MASTER_IMAGE_COMPRESSOR;
}
params.cluster_session_params = cgr_libcluster::ClusterSessionParams(
static_cast<cgr_libcluster::ClusterSessionParams::SessionMode>(r.render_session_mode()),
@ -878,7 +885,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
static_cast<cgr_libcluster::ClusterSessionParams::MasterDenoiser>(r.master_denoiser()),
r.save_denoise_io(), r.save_streamed_image(), save_every_n_images,
r.save_cameras(), r.filepath(),
static_cast<cgr_libcluster::ClusterSessionParams::MasterImageColorFormat>(master_image_color_format_num),
static_cast<cgr_libcluster::ClusterSessionParams::MasterImageCompressor>(master_image_compressor_num));
params.cluster_session_params.fps = r.fps();
cgr_libcluster::ModifyObjectParams & modify_object_params = params.cluster_session_params.modify_object_params;

View File

@ -1,5 +1,5 @@
if(WITH_CYCLES_DEVICE_CUDA AND CUDA_INCLUDE_DIRS)
add_subdirectory(libcluster_cuda_kernels)
endif()
add_subdirectory(libcluster)
add_subdirectory(libnetwork)
if(WITH_WEBRTC)
add_subdirectory(libstream)
endif(WITH_WEBRTC)

View File

@ -9,7 +9,9 @@ add_subdirectory(test)
#SET(OIDN_FOLDER ${OIDN_PATH}/oidn.linux)
#endif()
# From William: find_package(Boost 1.48 COMPONENTS serialization REQUIRED)
# Does not work on Windows at the moment, so turning this REQUIRED off
find_package(Boost 1.48 COMPONENTS serialization)
find_package(TurboJPEG REQUIRED)
# Uncomment instead of the above if using VCPKG
# find_package(libjpeg-turbo REQUIRED PATHS)
@ -54,14 +56,20 @@ set(SRC
net_client.cpp
rpc_blender_protocol.cpp
image_io_util.cpp
compression/turbojpeg_compressor.cpp
utils/timer.cpp
utils/vector_types.cpp
utils/image.cpp
denoising/denoise_buffer.cpp
denoising/denoising_context.cpp
denoising/master_denoiser.cpp
denoising/master_oidn_denoiser.cpp
)
if(WITH_CYCLES_DEVICE_OPTIX)
list(APPEND SRC denoising/master_optix_denoiser.cpp)
endif()
set(FLAT_BUFFER_FILES
net_camera.fbs
)
@ -140,6 +148,7 @@ set(SRC_HEADERS
utils/logging.h
utils/timer.h
utils/vector_types.h
utils/image.h
denoising/denoise_buffer.h
denoising/denoising_context.h
denoising/master_denoiser.h
@ -152,22 +161,31 @@ set(LIB
${Boost_LIBRARIES}
${TurboJPEG_LIBRARIES}
flatbuffers
)
if(WITH_CYCLES_DEVICE_OPTIX)
list(APPEND INC ${OPTIX_INCLUDE_DIRS})
endif()
if(WITH_CYCLES_DEVICE_CUDA AND CUDA_INCLUDE_DIRS AND UNIX)
message(STATUS "Building with CUDA, adding nv_encoder to the libcluster")
find_library(NVENCODEAPI_LIB nvidia-encode)
find_library(NVCUVID_LIB nvcuvid HINTS "/lib64/") # the same folder on dev laptop and docker image
list(APPEND SRC compression/nv_encoder.cpp)
list(APPEND SRC_HEADERS compression/nv_encoder.h)
list(APPEND SRC compression/nv_decoder.cpp)
list(APPEND SRC_HEADERS compression/nv_decoder.h)
list(APPEND LIB cycles_libcluster_cuda_kernels)
list(APPEND LIB ${NVENCODEAPI_LIB})
list(APPEND LIB ${NVCUVID_LIB})
if(WITH_CUDA_DYNLOAD)
list(APPEND LIB extern_cuew)
else()
list(APPEND LIB ${CUDA_CUDA_LIBRARY})
endif()
else()
message(STATUS "No CUDA or we are not on Linux so building libcluster without nv_encoder")
endif()
#if(APPLE)

View File

@ -61,12 +61,24 @@ public:
// END must go last
MASTER_IMAGE_COLOR_FORMAT_END,
};
enum MasterImageCompressor{
MASTER_IMAGE_COMPRESSOR_NONE = 0,
// Add new options here between _NONE and _END
// List of formats must be in sync with list in DNA_scene_types.h
MASTER_IMAGE_COMPRESSOR_JPEG = 1,
MASTER_IMAGE_COMPRESSOR_NVENCODER = 2,
// END must go last
MASTER_IMAGE_COMPRESSOR_END,
};
static const MasterImageColorFormat DEFAULT_MASTER_IMAGE_COLOR_FORMAT = MASTER_IMAGE_COLOR_FORMAT_LINEAR;
static const MasterImageCompressor DEFAULT_MASTER_IMAGE_COMPRESSOR = MASTER_IMAGE_COMPRESSOR_JPEG;
ClusterSessionParams(SessionMode session_mode, std::string master_address, int master_port, int num_servers,
int master_compression_quality, MasterDenoiser master_denoiser, bool save_denoise_io, bool save_streamed_image,
int save_every_n_images, bool save_cameras, const std::string & output_folder_path,
MasterImageColorFormat master_image_color_format, MasterImageCompressor master_image_compressor) :
session_mode(session_mode),
master_address(master_address),
master_port(master_port),
@ -74,6 +86,7 @@ public:
master_compression_quality(master_compression_quality),
master_denoiser(master_denoiser),
master_image_color_format(master_image_color_format),
master_image_compressor(master_image_compressor),
save_denoise_io(save_denoise_io),
save_streamed_image(save_streamed_image),
save_every_n_images(save_every_n_images),
@ -89,6 +102,7 @@ public:
master_compression_quality = 85;
master_denoiser = MASTER_DENOISER_NONE;
master_image_color_format = MASTER_IMAGE_COLOR_FORMAT_LINEAR;
master_image_compressor = MASTER_IMAGE_COMPRESSOR_JPEG;
}
bool modified(const ClusterSessionParams &params) const
@ -103,6 +117,7 @@ public:
save_every_n_images == params.save_every_n_images &&
save_cameras == params.save_cameras &&
master_image_color_format == params.master_image_color_format &&
master_image_compressor == params.master_image_compressor &&
output_folder_path == params.output_folder_path);
// modified method should not compare modify_object_params.
// When modified method returns true a session is restarted.
@ -116,6 +131,7 @@ public:
int master_compression_quality;
MasterDenoiser master_denoiser;
MasterImageColorFormat master_image_color_format;
MasterImageCompressor master_image_compressor;
bool save_denoise_io = false;
bool save_streamed_image = false;
// When save_every_n_images is 10, we only save 0th, 10th, 20th, 30th images

View File

@ -0,0 +1,262 @@
#include <chrono>
#include <cmath>
#include <stdexcept>
#include <string>
#include "nv_decoder.h"
#include "../cuda_context_provider.h"
#include "../utils/cuda_utils.h"
#include "../utils/image.h"
namespace cgr_libcluster {
static const char * getVideoCodecString(cudaVideoCodec video_codec) {
static struct {
cudaVideoCodec codec;
const char *name;
} CodecToName [] = {
{ cudaVideoCodec_MPEG1, "MPEG-1" },
{ cudaVideoCodec_MPEG2, "MPEG-2" },
{ cudaVideoCodec_MPEG4, "MPEG-4 (ASP)" },
{ cudaVideoCodec_VC1, "VC-1/WMV" },
{ cudaVideoCodec_H264, "AVC/H.264" },
{ cudaVideoCodec_JPEG, "M-JPEG" },
{ cudaVideoCodec_H264_SVC, "H.264/SVC" },
{ cudaVideoCodec_H264_MVC, "H.264/MVC" },
{ cudaVideoCodec_HEVC, "H.265/HEVC" },
{ cudaVideoCodec_VP8, "VP8" },
{ cudaVideoCodec_VP9, "VP9" },
{ cudaVideoCodec_AV1, "AV1" },
{ cudaVideoCodec_NumCodecs, "Invalid" },
{ cudaVideoCodec_YUV420, "YUV 4:2:0" },
{ cudaVideoCodec_YV12, "YV12 4:2:0" },
{ cudaVideoCodec_NV12, "NV12 4:2:0" },
{ cudaVideoCodec_YUYV, "YUYV 4:2:2" },
{ cudaVideoCodec_UYVY, "UYVY 4:2:2" },
};
if (video_codec >= 0 && video_codec <= cudaVideoCodec_NumCodecs) {
return CodecToName[video_codec].name;
}
for (int i = cudaVideoCodec_NumCodecs + 1; i < sizeof(CodecToName) / sizeof(CodecToName[0]); i++) {
if (video_codec == CodecToName[i].codec) {
return CodecToName[i].name;
}
}
return "Unknown";
}
static const char * getVideoChromaFormatString(cudaVideoChromaFormat chroma_format) {
static struct {
cudaVideoChromaFormat chroma_format;
const char *name;
} ChromaFormatToName[] = {
{ cudaVideoChromaFormat_Monochrome, "YUV 400 (Monochrome)" },
{ cudaVideoChromaFormat_420, "YUV 420" },
{ cudaVideoChromaFormat_422, "YUV 422" },
{ cudaVideoChromaFormat_444, "YUV 444" },
};
if (chroma_format >= 0 && chroma_format < sizeof(ChromaFormatToName) / sizeof(ChromaFormatToName[0])) {
return ChromaFormatToName[chroma_format].name;
}
return "Unknown";
}
static float getChromaHeightFactor(cudaVideoSurfaceFormat surface_format) {
float factor = 0.5;
switch (surface_format) {
case cudaVideoSurfaceFormat_NV12:
case cudaVideoSurfaceFormat_P016:
factor = 0.5;
break;
case cudaVideoSurfaceFormat_YUV444:
case cudaVideoSurfaceFormat_YUV444_16Bit:
factor = 1.0;
break;
}
return factor;
}
static int getChromaPlaneCount(cudaVideoSurfaceFormat surface_format) {
int numPlane = 1;
switch (surface_format) {
case cudaVideoSurfaceFormat_NV12:
case cudaVideoSurfaceFormat_P016:
numPlane = 1;
break;
case cudaVideoSurfaceFormat_YUV444:
case cudaVideoSurfaceFormat_YUV444_16Bit:
numPlane = 2;
break;
}
return numPlane;
}
NvDecoder::NvDecoder(CUcontext cuda_context):
m_cuda_context(cuda_context) {
LOG(INFO) << "NvDecoder constructor. Creating video parser";
CUVIDPARSERPARAMS videoParserParameters = {};
videoParserParameters.CodecType = cudaVideoCodec_H264;
videoParserParameters.ulMaxNumDecodeSurfaces = 1;
videoParserParameters.ulClockRate = 1000;
constexpr int low_latency_display_delay = 0;
videoParserParameters.ulMaxDisplayDelay = low_latency_display_delay;
videoParserParameters.pUserData = this;
videoParserParameters.pfnSequenceCallback = HandleVideoSequenceCallback;
videoParserParameters.pfnDecodePicture = HandlePictureDecodeCallback;
videoParserParameters.pfnDisplayPicture = HandlePictureDisplayCallback;
CUDA_API_CALL(cuvidCreateVideoParser(&m_parser, &videoParserParameters), THROW_IF_ERROR);
LOG(INFO) << "Created video parser";
}
NvDecoder::~NvDecoder() {
LOG(INFO) << "NvDecoder destructor";
if (m_parser) {
LOG(INFO) << "Destroying video parser";
CUDA_API_CALL(cuvidDestroyVideoParser(m_parser), DO_NOT_THROW);
}
if (m_decoder) {
LOG(INFO) << "Destroying video decoder";
CUDA_API_CALL(cuvidDestroyDecoder(m_decoder), DO_NOT_THROW);
}
LOG(INFO) << "NvDecoder released resources and is destroyed";
}
// Return values:
// 0 : fail
// 1 : succeeded, but driver should not override CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces
// >1: succeeded, and driver should override CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces with this return value
int NvDecoder::HandleVideoSequence(CUVIDEOFORMAT *video_format){
LOG(INFO) << "HandleVideoSequence callback. Creating nvdecoder";
LOG(INFO) << "Video Input Information:";
LOG(INFO) << "\tCodec : " << getVideoCodecString(video_format->codec);
LOG(INFO) << "\tSequence : " << (video_format->progressive_sequence ? "Progressive" : "Interlaced");
LOG(INFO) << "\tCoded size : [" << video_format->coded_width << ", " << video_format->coded_height << "]";
LOG(INFO) << "\tDisplay area : [" << video_format->display_area.left << ", " << video_format->display_area.top << ", "
<< video_format->display_area.right << ", " << video_format->display_area.bottom << "]";
LOG(INFO) << "\tChroma : " << getVideoChromaFormatString(video_format->chroma_format);
LOG(INFO) << "\tBit depth : " << video_format->bit_depth_luma_minus8 + 8;
CUVIDDECODECREATEINFO video_decode_create_info = { 0 };
video_decode_create_info.CodecType = video_format->codec;
video_decode_create_info.ChromaFormat = video_format->chroma_format;
video_decode_create_info.OutputFormat = cudaVideoSurfaceFormat_NV12;
video_decode_create_info.bitDepthMinus8 = video_format->bit_depth_luma_minus8;
video_decode_create_info.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
video_decode_create_info.ulNumOutputSurfaces = 2;
video_decode_create_info.ulCreationFlags = cudaVideoCreate_PreferCUVID;
// This is how nvidia recommends calculating ulNumDecodeSurfaces here:
// https://developer.nvidia.com/blog/optimizing-video-memory-usage-with-the-nvdecode-api-and-nvidia-video-codec-sdk/
video_decode_create_info.ulNumDecodeSurfaces = video_format->min_num_decode_surfaces + 3;
video_decode_create_info.ulWidth = video_format->coded_width;
video_decode_create_info.ulHeight = video_format->coded_height;
video_decode_create_info.ulMaxWidth = video_decode_create_info.ulWidth;
video_decode_create_info.ulMaxHeight = video_decode_create_info.ulHeight;
video_decode_create_info.ulTargetWidth = video_decode_create_info.ulWidth;
video_decode_create_info.ulTargetHeight = video_decode_create_info.ulHeight;
m_image_width_in_pixels = video_format->display_area.right - video_format->display_area.left;
// NV12/P016 output format width is 2 byte aligned because of U and V interleave
if (m_output_format == cudaVideoSurfaceFormat_NV12 ||
m_output_format == cudaVideoSurfaceFormat_P016) {
m_image_width_in_pixels = (m_image_width_in_pixels + 1) & ~1;
}
m_luma_height = video_format->display_area.bottom - video_format->display_area.top;
m_bytes_per_pixel = video_decode_create_info.bitDepthMinus8 > 0 ? 2 : 1;
m_chroma_height = (int)(std::ceil(m_luma_height * getChromaHeightFactor(video_decode_create_info.OutputFormat)));
m_num_chroma_planes = getChromaPlaneCount(video_decode_create_info.OutputFormat);
m_surface_height = video_decode_create_info.ulTargetHeight;
const int size_of_decoded_image_yuv_in_bytes = m_image_width_in_pixels *
(m_luma_height + (m_chroma_height * m_num_chroma_planes)) * m_bytes_per_pixel;
m_decoded_image_yuv.resize(size_of_decoded_image_yuv_in_bytes);
CUDAContextScope cuda_context_scope(m_cuda_context);
CUDA_API_CALL(cuvidCreateDecoder(&m_decoder, &video_decode_create_info), THROW_IF_ERROR);
return video_decode_create_info.ulNumDecodeSurfaces;
}
void NvDecoder::decode(const std::vector<uint8_t> & encoded_image_in, const int frame_id,
std::vector<cgr_libcluster::uchar3> * decoded_image_out_ptr) {
if(!decoded_image_out_ptr) {
std::string message = "Pointer to decoded image buffer is null. There is no place for output image, break.";
LOG(ERROR) << message;
throw std::invalid_argument(message);
}
m_decoded_image_rgb_out_ptr = decoded_image_out_ptr;
CUVIDSOURCEDATAPACKET packet = { 0 };
packet.payload = encoded_image_in.data();
packet.payload_size = encoded_image_in.size();
packet.flags = CUVID_PKT_ENDOFPICTURE | CUVID_PKT_TIMESTAMP;
packet.timestamp = 0;
if (encoded_image_in.size() == 0) {
packet.flags |= CUVID_PKT_ENDOFSTREAM;
}
CUDA_API_CALL(cuvidParseVideoData(m_parser, &packet), THROW_IF_ERROR);
}
// 0: fail, >=1: succeeded
int NvDecoder::HandlePictureDecode(CUVIDPICPARAMS *pic_params) {
CUDAContextScope cuda_context_scope(m_cuda_context);
CUDA_API_CALL(cuvidDecodePicture(m_decoder, pic_params), THROW_IF_ERROR);
return 1;
}
// 0: fail; >=1: succeeded
int NvDecoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *parser_disp_info) {
CUVIDPROCPARAMS video_processing_parameters = {};
video_processing_parameters.progressive_frame = parser_disp_info->progressive_frame;
video_processing_parameters.second_field = parser_disp_info->repeat_first_field + 1;
video_processing_parameters.top_field_first = parser_disp_info->top_field_first;
video_processing_parameters.unpaired_field = parser_disp_info->repeat_first_field < 0;
video_processing_parameters.output_stream = m_cuvid_stream;
CUdeviceptr src_frame_device_ptr = 0;
unsigned int src_pitch = 0;
CUDAContextScope cuda_context_scope(m_cuda_context);
CUDA_API_CALL(cuvidMapVideoFrame(m_decoder, parser_disp_info->picture_index, &src_frame_device_ptr,
&src_pitch, &video_processing_parameters), THROW_IF_ERROR);
CUVIDGETDECODESTATUS decode_status;
memset(&decode_status, 0, sizeof(decode_status));
CUresult result = cuvidGetDecodeStatus(m_decoder, parser_disp_info->picture_index, &decode_status);
if (result == CUDA_SUCCESS &&
(decode_status.decodeStatus == cuvidDecodeStatus_Error ||
decode_status.decodeStatus == cuvidDecodeStatus_Error_Concealed)) {
LOG(INFO) << "Image decoding failed with status: " << decode_status.decodeStatus;
}
uint8_t *decoded_image_yuv_ptr = m_decoded_image_yuv.data();
// Copy luma plane
CUDA_MEMCPY2D mem_cpy_2d = { 0 };
mem_cpy_2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
mem_cpy_2d.srcDevice = src_frame_device_ptr;
mem_cpy_2d.srcPitch = src_pitch;
mem_cpy_2d.dstMemoryType = CU_MEMORYTYPE_HOST;
mem_cpy_2d.dstDevice = (CUdeviceptr)(mem_cpy_2d.dstHost = decoded_image_yuv_ptr);
mem_cpy_2d.dstPitch = m_device_frame_pitch ? m_device_frame_pitch : m_image_width_in_pixels * m_bytes_per_pixel;
mem_cpy_2d.WidthInBytes = m_image_width_in_pixels * m_bytes_per_pixel;
mem_cpy_2d.Height = m_luma_height;
CUDA_API_CALL(cuMemcpy2DAsync(&mem_cpy_2d, m_cuvid_stream), THROW_IF_ERROR);
// Copy chroma plane
// NVDEC output has luma height aligned by 2. Adjust chroma offset by aligning height
mem_cpy_2d.srcDevice = (CUdeviceptr)((uint8_t *)src_frame_device_ptr + mem_cpy_2d.srcPitch * ((m_surface_height + 1) & ~1));
mem_cpy_2d.dstDevice = (CUdeviceptr)(mem_cpy_2d.dstHost = decoded_image_yuv_ptr + mem_cpy_2d.dstPitch * m_luma_height);
mem_cpy_2d.Height = m_chroma_height;
CUDA_API_CALL(cuMemcpy2DAsync(&mem_cpy_2d, m_cuvid_stream), THROW_IF_ERROR);
if (m_num_chroma_planes == 2) {
mem_cpy_2d.srcDevice = (CUdeviceptr)((uint8_t *)src_frame_device_ptr + mem_cpy_2d.srcPitch * ((m_surface_height + 1) & ~1) * 2);
mem_cpy_2d.dstDevice = (CUdeviceptr)(mem_cpy_2d.dstHost = decoded_image_yuv_ptr + mem_cpy_2d.dstPitch * m_luma_height * 2);
mem_cpy_2d.Height = m_chroma_height;
CUDA_API_CALL(cuMemcpy2DAsync(&mem_cpy_2d, m_cuvid_stream), THROW_IF_ERROR);
}
CUDA_API_CALL(cuStreamSynchronize(m_cuvid_stream), THROW_IF_ERROR);
CUDA_API_CALL(cuvidUnmapVideoFrame(m_decoder, src_frame_device_ptr), THROW_IF_ERROR);
const int num_pixels = m_image_width_in_pixels * m_luma_height;
if(m_decoded_image_rgb_out_ptr->size() != num_pixels) {
m_decoded_image_rgb_out_ptr->resize(num_pixels);
}
yuv2Rgb(decoded_image_yuv_ptr, m_image_width_in_pixels, m_luma_height, m_decoded_image_rgb_out_ptr);
return 1;
}
}
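For orientation, the host-side buffer the copies above fill is a packed NV12 image: the luma plane followed by one interleaved UV plane at half height. A quick worked example of the sizes involved (a sketch, assuming a 1920x1080 display area and 8-bit samples):

#include <cassert>
#include <cstddef>

int main() {
  // Assumed NV12 frame: 1920x1080 display area, 1 byte per sample.
  const std::size_t width = 1920, luma_height = 1080;
  const std::size_t chroma_height = luma_height / 2; // factor 0.5 for NV12
  const std::size_t num_chroma_planes = 1;           // U and V are interleaved
  // Host buffer size, as computed in HandleVideoSequence above:
  const std::size_t total_bytes = width * (luma_height + chroma_height * num_chroma_planes);
  assert(total_bytes == 3110400);
  // The chroma copy lands right after the luma plane (dstPitch * m_luma_height):
  const std::size_t chroma_offset = width * luma_height;
  assert(chroma_offset == 2073600);
  return 0;
}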

View File

@ -0,0 +1,81 @@
#ifndef __NV_DECODER_H__
#define __NV_DECODER_H__
#ifdef WITH_CUDA_DYNLOAD
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
#include "nvcuvid.h"
#include "../utils/logging.h"
#include "../utils/vector_types.h" // for uchar3
namespace cgr_libcluster {
class NvDecoder {
public:
NvDecoder(CUcontext cuda_context);
~NvDecoder();
// Decodes input image, converts into rgb format and puts into decoded_image_out_ptr
// Decoder resizes output vector so it can accommodate a decoded image if needed
void decode(const std::vector<uint8_t> & encoded_image_in, const int frame_id,
std::vector<cgr_libcluster::uchar3> * decoded_image_out_ptr);
private:
const CUcontext m_cuda_context;
CUvideoparser m_parser = nullptr;
CUvideodecoder m_decoder = nullptr;
CUstream m_cuvid_stream = 0;
unsigned int m_image_width_in_pixels = 0;
unsigned int m_luma_height = 0;
unsigned int m_chroma_height = 0;
unsigned int m_num_chroma_planes = 0;
int m_bytes_per_pixel = 1;
size_t m_device_frame_pitch = 0;
int m_surface_height = 0;
cudaVideoSurfaceFormat m_output_format = cudaVideoSurfaceFormat_NV12;
std::vector<uint8_t> m_decoded_image_yuv;
std::vector<cgr_libcluster::uchar3> * m_decoded_image_rgb_out_ptr = nullptr;
// Callback function to be registered for getting a callback when decoding of sequence starts
static int CUDAAPI HandleVideoSequenceCallback(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
return ((NvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat);
}
// Callback function to be registered for getting a callback when a frame is ready to be decoded
static int CUDAAPI HandlePictureDecodeCallback(void *pUserData, CUVIDPICPARAMS *pPicParams) {
return ((NvDecoder *)pUserData)->HandlePictureDecode(pPicParams);
}
// Callback function to be registered for getting a callback when a decoded frame is available for display
static int CUDAAPI HandlePictureDisplayCallback(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo) {
return ((NvDecoder *)pUserData)->HandlePictureDisplay(pDispInfo);
}
// This function gets called when a sequence is ready to be decoded. The function also gets called
// when there is format change. It inits video decoder
int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
// This function gets called when a picture is ready to be decoded. cuvidDecodePicture is called from
// this function to decode the picture
int HandlePictureDecode(CUVIDPICPARAMS *pPicParams);
// This function gets called after a picture is decoded and available for display. Frames are fetched
// and stored in internal buffer
int HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo);
};
}
#endif
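For reference, a minimal caller-side sketch of this decoder's contract (hypothetical wiring; the real client creates the decoder lazily in outputImageIfRequested and reuses its buffers):

#include <cstdint>
#include <vector>

// Hypothetical usage of cgr_libcluster::NvDecoder for one H.264 frame.
void decodeOneFrame(CUcontext cuda_context, const std::vector<uint8_t> &encoded_frame, int frame_id) {
  cgr_libcluster::NvDecoder decoder(cuda_context);
  std::vector<cgr_libcluster::uchar3> decoded_rgb;
  // With ulMaxDisplayDelay = 0 and CUVID_PKT_ENDOFPICTURE set, the display
  // callback is expected to fire inside this call, so decoded_rgb is filled
  // (and resized if needed) before decode() returns.
  decoder.decode(encoded_frame, frame_id, &decoded_rgb);
  // An empty input adds CUVID_PKT_ENDOFSTREAM and flushes the parser.
  decoder.decode(std::vector<uint8_t>(), frame_id, &decoded_rgb);
}

Note that decode() throws std::invalid_argument on a null output pointer, so the caller must supply a buffer even for the flush.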

View File

@ -6,6 +6,7 @@
#include <glog/logging.h>
#include "nv_encoder.h"
#include "../cuda_context_provider.h"
namespace cgr_libcluster {
@ -24,22 +25,56 @@ NvEncoder::NvEncoder(NV_ENC_BUFFER_FORMAT buffer_format,
}
NvEncoder::~NvEncoder() {
VLOG(3) << "NvEncoder destructor";
if(m_encoder_session_handle.get()) {
NV_ENC_PIC_PARAMS end_of_input_stream_pic_params = {NV_ENC_PIC_PARAMS_VER};
end_of_input_stream_pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncEncodePicture(
m_encoder_session_handle.get(), &end_of_input_stream_pic_params);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncEncodePicture call with end of stream failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
} else {
VLOG(3) << "Communicated end of stream to the NvEncoder";
}
nv_error_code = m_nvencode_api_function_list.nvEncUnregisterResource(m_encoder_session_handle.get(),
m_input_buffer_registration.get());
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. NvEncoder input buffer un-registration failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
} else {
VLOG(3) << "Unregistered NvEncoder input buffer successfully";
}
nv_error_code = m_nvencode_api_function_list.nvEncDestroyBitstreamBuffer(m_encoder_session_handle.get(),
m_bitstream_output_buffer.get());
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncDestroyBitstreamBuffer failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
} else {
VLOG(3) << "Destroyed NvEncoder bitstream buffer successfully";
}
m_nvencode_api_function_list.nvEncDestroyEncoder(m_encoder_session_handle.get());
VLOG(3) << "Destroyed NvEncoder session successfully";
}
if(m_cuda_context) {
CUDAContextScope cuda_context_scope(m_cuda_context);
cuMemFree(m_cuda_device_ptr.get());
VLOG(3) << "Released cuda input buffer successfully";
}
VLOG(3) << "NvEncoder released resources and is destroyed";
}
void NvEncoder::initNvEncoder() {
// most of the values that are currently set here are taken from the NVIDIA samples NvEncoder.cpp
// we can adjust them later on if needed
VLOG(3) << "Initializing NVENCODER for width: " << m_width << " height: " << m_height;
uint32_t driver_version = 0;
uint32_t header_version = (NVENCAPI_MAJOR_VERSION << 4) | NVENCAPI_MINOR_VERSION;
NVENCSTATUS nv_error_code_v = NvEncodeAPIGetMaxSupportedVersion(&driver_version);
@ -56,15 +91,15 @@ void NvEncoder::initNvEncoder() {
VLOG(1) << message;
throw std::runtime_error(message);
}
m_nvencode_api_function_list = { NV_ENCODE_API_FUNCTION_LIST_VER };
NVENCSTATUS nv_error_code = NvEncodeAPICreateInstance(&m_nvencode_api_function_list);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. NvEncodeAPICreateInstance failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
throw std::runtime_error(message);
}
if(!m_nvencode_api_function_list.nvEncOpenEncodeSessionEx) {
std::string message = "FATAL. nvEncOpenEncodeSessionEx API not found";
VLOG(1) << message;
throw std::runtime_error(message);
@ -74,13 +109,13 @@ void NvEncoder::initNvEncoder() {
encode_session_ex_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
encode_session_ex_params.apiVersion = NVENCAPI_VERSION;
void * encoder_session_handle;
nv_error_code = m_nvencode_api_function_list.nvEncOpenEncodeSessionEx(&encode_session_ex_params, &encoder_session_handle);
m_encoder_session_handle.init(encoder_session_handle, __FILE__, __FUNCTION__, __LINE__);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncOpenEncodeSessionEx failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
m_nvencode_api_function_list.nvEncDestroyEncoder(m_encoder_session_handle.get());
throw std::runtime_error(message);
}
if(!m_encoder_session_handle.get()) {
@ -92,66 +127,66 @@ void NvEncoder::initNvEncoder() {
// extern/nvidia/Video_Codec_SDK_11.1.5/Interface/nvEncodeAPI.h
// github link:
// https://ghe.oculus-rep.com/FRL-Graphics-Research/distributed_blender_cycles/blob/cluster_blender_32_main_nvenc/extern/nvidia/Video_Codec_SDK_11.1.5/Interface/nvEncodeAPI.h#L1667
m_encoder_init_params = { 0 };
m_encoder_init_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
GUID preset_guid = NV_ENC_PRESET_P1_GUID;
m_encoder_init_params.encodeGUID = NV_ENC_CODEC_H264_GUID;
m_encoder_init_params.presetGUID = preset_guid;
m_encoder_init_params.encodeWidth = m_width;
m_encoder_init_params.encodeHeight = m_height;
m_encoder_init_params.darWidth = m_width;
m_encoder_init_params.darHeight = m_height;
m_encoder_init_params.frameRateNum = 30;
m_encoder_init_params.frameRateDen = 1;
m_encoder_init_params.enablePTD = 1;
m_encoder_init_params.reportSliceOffsets = 0;
m_encoder_init_params.enableSubFrameWrite = 0;
m_encoder_init_params.maxEncodeWidth = m_width;
m_encoder_init_params.maxEncodeHeight = m_height;
m_encoder_init_params.enableMEOnlyMode = false;
m_encoder_init_params.enableOutputInVidmem = false;
m_encoder_init_params.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY;
//m_encoder_init_params.tuningInfo = NV_ENC_TUNING_INFO_HIGH_QUALITY;
m_encode_config = { 0 };
m_encoder_init_params.encodeConfig = &m_encode_config;
NV_ENC_PRESET_CONFIG preset_config_ex = { NV_ENC_PRESET_CONFIG_VER, { NV_ENC_CONFIG_VER } };
m_nvencode_api_function_list.nvEncGetEncodePresetConfigEx(m_encoder_session_handle.get(),
m_encoder_init_params.encodeGUID, preset_guid, m_encoder_init_params.tuningInfo, &preset_config_ex);
memcpy(m_encoder_init_params.encodeConfig, &preset_config_ex.presetCfg, sizeof(NV_ENC_CONFIG));
m_encode_config.frameIntervalP = 1;
m_encode_config.gopLength = NVENC_INFINITE_GOPLENGTH;
m_encode_config.encodeCodecConfig.h264Config.idrPeriod = m_encode_config.gopLength;
m_encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CBR;
m_encode_config.rcParams.multiPass = NV_ENC_TWO_PASS_FULL_RESOLUTION;
// This produces images with acceptable quality. Tested on Tesla and Trudy scenes on flat screen.
// We may adjust the value of the bitrate based on results of further testing.
m_encode_config.rcParams.averageBitRate = 2800000;
// Below is bitrate calculation from the NVIDIA samples app. It's too low for us.
// Keep it here for now for reference. Remove once bitrate calculation is finalised.
// m_encode_config.rcParams.averageBitRate = (static_cast<unsigned int>(
// 5.0f * m_encoder_init_params.encodeWidth * m_encoder_init_params.encodeHeight) /
// (width * height)) * 100000;
VLOG(3) << "Average bitrate: " << m_encode_config.rcParams.averageBitRate;
m_encode_config.rcParams.vbvBufferSize = (m_encode_config.rcParams.averageBitRate * m_encoder_init_params.frameRateDen /
m_encoder_init_params.frameRateNum) * 5;
m_encode_config.rcParams.maxBitRate = m_encode_config.rcParams.averageBitRate * 2;
m_encode_config.rcParams.vbvInitialDelay = m_encode_config.rcParams.vbvBufferSize;
nv_error_code = m_nvencode_api_function_list.nvEncInitializeEncoder(m_encoder_session_handle.get(), &m_encoder_init_params);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncInitializeEncoder failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
throw std::runtime_error(message);
}
VLOG(3) << "NvEncoder initialization completed";
}
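With the defaults above (2.8 Mbps CBR at 30/1 fps), the derived rate-control values work out as follows; a small arithmetic sketch for reference:

// Rate-control arithmetic for the defaults set in initNvEncoder() above.
const unsigned average_bit_rate = 2800000;              // 2.8 Mbps CBR
const unsigned frame_rate_num = 30, frame_rate_den = 1; // 30 fps
// vbvBufferSize: one frame's worth of bits, times 5 frames of buffering.
const unsigned vbv_buffer_size =
    (average_bit_rate * frame_rate_den / frame_rate_num) * 5; // 466,665 bits
const unsigned max_bit_rate = average_bit_rate * 2;           // 5,600,000 bps
const unsigned vbv_initial_delay = vbv_buffer_size;           // start with a full buffer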
void NvEncoder::allocateOutputBuffer() {
NV_ENC_CREATE_BITSTREAM_BUFFER create_bitstream_buffer_struct = { NV_ENC_CREATE_BITSTREAM_BUFFER_VER };
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncCreateBitstreamBuffer(m_encoder_session_handle.get(),
&create_bitstream_buffer_struct);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncCreateBitstreamBuffer failed with error: "
@ -165,14 +200,7 @@ void NvEncoder::allocateOutputBuffer() {
}
void NvEncoder::allocateInputBuffer() {
CUDAContextScope cuda_context_scope(m_cuda_context);
const int chroma_height = getChromaHeight(m_buffer_format, m_height);
const int height_in_rows = m_height + chroma_height;
const int width_in_bytes = getWidthInBytes(m_buffer_format, m_width);
@ -180,7 +208,7 @@ void NvEncoder::allocateInputBuffer() {
CUdeviceptr cuda_device_ptr;
VLOG(3) << "Allocating input buffer with cuMemAllocPitch cuda_pitch: " << m_cuda_pitch.get() << " width_in_bytes: " <<
width_in_bytes << " height in rows: " << height_in_rows;
CUresult cu_result = cuMemAllocPitch((CUdeviceptr *)&cuda_device_ptr,
&cuda_pitch,
width_in_bytes,
height_in_rows,
@ -195,14 +223,6 @@ void NvEncoder::allocateInputBuffer() {
m_cuda_pitch.init(cuda_pitch, __FILE__, __FUNCTION__, __LINE__);
m_cuda_device_ptr.init(cuda_device_ptr, __FILE__, __FUNCTION__, __LINE__);
VLOG(3) << "Successfully allocated input buffer with cuMemAllocPitch";
VLOG(3) << "Input CUDA buffer allocation completed";
registerInputResources(cuda_device_ptr);
}
@ -221,7 +241,7 @@ void NvEncoder::registerInputResources(CUdeviceptr cuda_device_ptr) {
register_resource_struct.pOutputFencePoint = nullptr;
VLOG(3) << "nvEncRegisterResource with width: " << register_resource_struct.width <<
" height: " << register_resource_struct.height << " pitch: " << register_resource_struct.pitch;
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncRegisterResource(m_encoder_session_handle.get(),
&register_resource_struct);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncRegisterResource failed with error: "
@ -245,41 +265,56 @@ void NvEncoder::registerInputResources(CUdeviceptr cuda_device_ptr) {
VLOG(3) << "Successfully registered input buffer resource"; VLOG(3) << "Successfully registered input buffer resource";
} }
bool NvEncoder::getEncodedBuffer(NV_ENC_MAP_INPUT_RESOURCE &map_input_resource, AutoEnlargingBuffer<uint8_t> &encoded_buffer_out, bool NvEncoder::getEncodedBuffer(NV_ENC_MAP_INPUT_RESOURCE &map_input_resource, std::vector<uint8_t> &encoded_buffer_out,
high_resolution_clock::time_point & encoding_done) const { high_resolution_clock::time_point & encoding_done) const {
NV_ENC_LOCK_BITSTREAM lock_bitstream_data = { NV_ENC_LOCK_BITSTREAM_VER }; NV_ENC_LOCK_BITSTREAM lock_bitstream_data = { NV_ENC_LOCK_BITSTREAM_VER };
lock_bitstream_data.outputBitstream = m_bitstream_output_buffer.get(); lock_bitstream_data.outputBitstream = m_bitstream_output_buffer.get();
lock_bitstream_data.doNotWait = false; lock_bitstream_data.doNotWait = false;
NVENCSTATUS nv_error_code = nvencode_api_function_list.nvEncLockBitstream(m_encoder_session_handle.get(), &lock_bitstream_data); NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncLockBitstream(m_encoder_session_handle.get(), &lock_bitstream_data);
if(nv_error_code != NV_ENC_SUCCESS) { if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(1) << "ERROR. nvEncLockBitstream failed with error: " << nv_error_code; VLOG(1) << "ERROR. nvEncLockBitstream failed with error: " << nv_error_code;
return false; return false;
} }
encoding_done = high_resolution_clock::now(); encoding_done = high_resolution_clock::now();
uint8_t * encoded_data_ptr = (uint8_t *)lock_bitstream_data.bitstreamBufferPtr; uint8_t * encoded_data_ptr = (uint8_t *)lock_bitstream_data.bitstreamBufferPtr;
encoded_buffer_out.insert(&encoded_data_ptr[0], lock_bitstream_data.bitstreamSizeInBytes); encoded_buffer_out.assign(encoded_data_ptr, encoded_data_ptr + lock_bitstream_data.bitstreamSizeInBytes);
nv_error_code = nvencode_api_function_list.nvEncUnlockBitstream(m_encoder_session_handle.get(), lock_bitstream_data.outputBitstream); nv_error_code = m_nvencode_api_function_list.nvEncUnlockBitstream(m_encoder_session_handle.get(), lock_bitstream_data.outputBitstream);
if(nv_error_code != NV_ENC_SUCCESS) { if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(1) << "ERROR. nvEncUnlockBitstream failed with error: " << nv_error_code; VLOG(1) << "ERROR. nvEncUnlockBitstream failed with error: " << nv_error_code;
return false; return false;
} }
nv_error_code = nvencode_api_function_list.nvEncUnmapInputResource(m_encoder_session_handle.get(), map_input_resource.mappedResource); nv_error_code = m_nvencode_api_function_list.nvEncUnmapInputResource(m_encoder_session_handle.get(), map_input_resource.mappedResource);
if(nv_error_code != NV_ENC_SUCCESS) { if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(1) << "ERROR. nvEncUnmapInputResource failed with error: " << nv_error_code; VLOG(1) << "ERROR. nvEncUnmapInputResource failed with error: " << nv_error_code;
return false; return false;
} }
std::chrono::duration<float> copy_to_cpu_time_sec = high_resolution_clock::now() - encoding_done; std::chrono::duration<float> copy_to_cpu_time_sec = high_resolution_clock::now() - encoding_done;
VLOG(3) << "Time to copy encoded image buffer to CPU memory: " << copy_to_cpu_time_sec.count() << " sec"; VLOG(3) << "Time to copy encoded image buffer to CPU memory: " << copy_to_cpu_time_sec.count() <<
" sec. Size: " << encoded_buffer_out.size();
return true; return true;
} }
// Do explicit template instantiation for encode() and
// copyInputBufferIntoGpuMappedMemory() methods (see below) for the following reasons:
// we only support buffers in CPU/host memory as uint8_t* and buffers in GPU/device memory as CUdeviceptr,
// so instantiate explicitly only for these types. This also allows us to keep the template
// methods' implementation in the .cpp file instead of the .h to keep the header clean.
// Explicit template instantiation of encode() for input buffers in CPU memory as uint8_t*
template bool NvEncoder::encode(const uint8_t*, std::vector<uint8_t> &) const;
// Explicit template instantiation of encode() for input buffers in GPU memory as CUdeviceptr
template bool NvEncoder::encode(const CUdeviceptr, std::vector<uint8_t> &) const;
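The header/source split the comment describes, reduced to a standalone sketch with hypothetical names: the declaration alone is visible to clients, while the definition plus explicit instantiations live in the .cpp, emitting code for exactly the two supported buffer types:

// sketch.h -- clients only see the declaration.
template <typename T> bool encodeBuffer(T source_buffer);

// sketch.cpp -- the definition stays out of the header...
template <typename T> bool encodeBuffer(T source_buffer) {
  /* ... */
  return true;
}
// ...and explicit instantiations emit code for the supported types only;
// any other T fails at link time, which documents the supported surface.
template bool encodeBuffer<const uint8_t *>(const uint8_t *);
template bool encodeBuffer<CUdeviceptr>(CUdeviceptr);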
template<typename T>
bool NvEncoder::encode(const T input_buffer, std::vector<uint8_t> &encoded_buffer_out) const {
CUDAContextScope cuda_context_scope(m_cuda_context);
if(m_encoder_session_handle.get() == nullptr) {
VLOG(1) << "ERROR. encoder_session_handle is null. Encoder is not initialized or initialization failed.";
return false;
}
NV_ENC_MAP_INPUT_RESOURCE map_input_resource = { NV_ENC_MAP_INPUT_RESOURCE_VER };
if(!copyInputBufferIntoGpuMappedMemory(input_buffer, map_input_resource)) {
return false;
}
auto start_encoding = high_resolution_clock::now();
@ -305,24 +340,31 @@ bool NvEncoder::encode(const uint8_t* input_buffer_on_host, AutoEnlargingBuffer<
}
const NVENCSTATUS NvEncoder::startEncoding(NV_ENC_MAP_INPUT_RESOURCE & map_input_resource) const {
NV_ENC_PIC_PARAMS pic_params = {};
pic_params.version = NV_ENC_PIC_PARAMS_VER;
pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
pic_params.inputBuffer = map_input_resource.mappedResource;
pic_params.bufferFmt = m_buffer_format;
pic_params.inputWidth = m_width;
pic_params.inputHeight = m_height;
pic_params.outputBitstream = m_bitstream_output_buffer.get();
pic_params.completionEvent = nullptr;
return m_nvencode_api_function_list.nvEncEncodePicture(m_encoder_session_handle.get(), &pic_params);
}
// Explicit template instantiation of copyInputBufferIntoGpuMappedMemory() for input buffers in CPU memory as uint8_t*
template bool NvEncoder::copyInputBufferIntoGpuMappedMemory(const uint8_t*, NV_ENC_MAP_INPUT_RESOURCE &) const;
// Explicit template instantiation of copyInputBufferIntoGpuMappedMemory() for input buffers in GPU memory as CUdeviceptr
template bool NvEncoder::copyInputBufferIntoGpuMappedMemory(const CUdeviceptr, NV_ENC_MAP_INPUT_RESOURCE &) const;
template<typename T>
bool NvEncoder::copyInputBufferIntoGpuMappedMemory(T input_buffer,
NV_ENC_MAP_INPUT_RESOURCE &map_input_resource) const {
auto start_copy_to_gpu = high_resolution_clock::now();
const int width_in_bytes = getWidthInBytes(m_buffer_format, m_width);
CUDA_MEMCPY2D memcpy2d = { 0 };
setSourceBuffer(input_buffer, memcpy2d);
memcpy2d.srcPitch = getWidthInBytes(m_buffer_format, m_width);
memcpy2d.dstMemoryType = CU_MEMORYTYPE_DEVICE;
memcpy2d.dstDevice = m_cuda_device_ptr.get();
@ -331,7 +373,7 @@ bool NvEncoder::copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC
memcpy2d.Height = m_height;
cuMemcpy2D(&memcpy2d);
for (int i = 0; i < m_src_chroma_offsets.get().size(); ++i) {
setSourceBuffer(input_buffer + m_src_chroma_offsets.get()[i], memcpy2d);
memcpy2d.dstDevice = (CUdeviceptr)((uint8_t *)m_cuda_device_ptr.get() + m_dst_chroma_offsets.get()[i]);
memcpy2d.srcPitch = getChromaPitch(m_buffer_format, memcpy2d.srcPitch);
memcpy2d.dstPitch = getChromaPitch(m_buffer_format, memcpy2d.dstPitch);
@ -340,7 +382,7 @@ bool NvEncoder::copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC
cuMemcpy2D(&memcpy2d);
}
map_input_resource.registeredResource = m_input_buffer_registration.get();
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncMapInputResource(m_encoder_session_handle.get(), &map_input_resource);
if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(3) << "ERROR. nvEncMapInputResource failed with error: " << std::to_string(nv_error_code);
return false;
@ -350,6 +392,16 @@ bool NvEncoder::copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC
return true;
}
void NvEncoder::setSourceBuffer(const uint8_t * input_buffer, CUDA_MEMCPY2D &memcpy2d) const {
memcpy2d.srcHost = input_buffer;
memcpy2d.srcMemoryType = CU_MEMORYTYPE_HOST;
}
void NvEncoder::setSourceBuffer(const CUdeviceptr input_buffer, CUDA_MEMCPY2D &memcpy2d) const {
memcpy2d.srcDevice = input_buffer;
memcpy2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
}
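The two setSourceBuffer overloads above are what let the copyInputBufferIntoGpuMappedMemory template accept either a host pointer or a CUDA device pointer: overload resolution picks the source memory type at compile time. Below is a standalone sketch of the same dispatch pattern, using hypothetical stand-in types (FakeCUdeviceptr, FakeMemcpy2D, describeSource, copyFrom) rather than the real CUDA ones:

#include <cstdint>
#include <iostream>

// Stand-ins for CUdeviceptr and CUDA_MEMCPY2D, for illustration only.
using FakeCUdeviceptr = unsigned long long;
struct FakeMemcpy2D {
  const void *srcHost = nullptr;
  FakeCUdeviceptr srcDevice = 0;
  bool src_is_device = false;
};

// Overloads select the source memory type, as setSourceBuffer does above.
void describeSource(const uint8_t *src, FakeMemcpy2D &m) { m.srcHost = src; m.src_is_device = false; }
void describeSource(FakeCUdeviceptr src, FakeMemcpy2D &m) { m.srcDevice = src; m.src_is_device = true; }

template <typename T>
void copyFrom(T src) {
  FakeMemcpy2D m;
  describeSource(src, m); // the only line that differs per source type
  std::cout << (m.src_is_device ? "device" : "host") << " source\n";
}

int main() {
  uint8_t host_pixels[16] = {};
  copyFrom<const uint8_t *>(host_pixels); // host path
  copyFrom(FakeCUdeviceptr{0xdeadbeef});  // device path
}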
uint32_t NvEncoder::getWidthInBytes(const NV_ENC_BUFFER_FORMAT buffer_format, const uint32_t width) const {
switch (buffer_format) {
case NV_ENC_BUFFER_FORMAT_NV12:

View File

@ -1,76 +1,36 @@
#ifndef __NV_ENCODER_H__
#define __NV_ENCODER_H__
#include <chrono>
#include <vector>
#ifdef WITH_CUDA_DYNLOAD
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by the Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
#include "final.h"
#include "nvEncodeAPI.h"
#include "../utils/logging.h"
namespace cgr_libcluster {
using namespace std::chrono;
// This class holds a preallocated buffer with the requested initial size.
// It enlarges the buffer automatically if it cannot accommodate incoming data.
// The buffer never shrinks; it only grows when needed, to reduce the number of memory re-allocations
// (see the usage sketch after the class).
template <typename T>
class AutoEnlargingBuffer {
public:
AutoEnlargingBuffer(size_t init_size){
if(init_size > 0) {
resize(init_size);
}
}
~AutoEnlargingBuffer(){
delete [] data_buffer;
}
void insert(const T* input_data, const size_t input_data_size) {
if(input_data_size > m_capacity) {
VLOG(3) << "Current capacity of the AutoEnlargingBuffer: " << m_capacity <<
" Reallocating to support larger data size: " << input_data_size;
resize(input_data_size);
}
memcpy(data_buffer, input_data, input_data_size);
m_data_size = input_data_size;
}
T* data() const {
return data_buffer;
}
size_t size() const {
return m_data_size;
}
private:
T* data_buffer = nullptr;
size_t m_data_size = 0;
size_t m_capacity = 0;
void resize(const size_t new_buffer_size) {
delete [] data_buffer;
data_buffer = new T[new_buffer_size];
m_capacity = new_buffer_size;
m_data_size = 0;
}
};
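A minimal usage sketch of this helper, assuming the class above (and its logging dependency) is available; the sizes and byte values are arbitrary:

#include <cstdint>
#include <iostream>

int main() {
  cgr_libcluster::AutoEnlargingBuffer<uint8_t> buffer(64); // preallocate 64 bytes
  const uint8_t small_packet[16] = {1, 2, 3};
  buffer.insert(small_packet, sizeof(small_packet));       // fits, no reallocation
  std::cout << buffer.size() << "\n";                      // 16
  const uint8_t large_packet[256] = {};
  buffer.insert(large_packet, sizeof(large_packet));       // capacity grows to 256
  std::cout << buffer.size() << "\n";                      // 256
}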
class NvEncoder {
public:
static const NV_ENC_BUFFER_FORMAT BUFFER_FORMAT_FOR_IMAGES_FROM_MASTER = NV_ENC_BUFFER_FORMAT_ABGR;
NvEncoder(NV_ENC_BUFFER_FORMAT buffer_format, CUcontext cuda_context, const int width, const int height);
~NvEncoder();
bool encode(const uint8_t* input_buffer_on_host, AutoEnlargingBuffer<uint8_t> &encoded_buffer_out) const;
// Encodes an image within the input_buffer
template<typename T>
bool encode(const T input_buffer, std::vector<uint8_t> &encoded_buffer_out) const;
private:
static constexpr int DEVICE_NUM = 0;
@ -80,7 +40,9 @@ private:
const int m_width;
const int m_height;
const NV_ENC_BUFFER_FORMAT m_buffer_format;
NV_ENC_INITIALIZE_PARAMS m_encoder_init_params;
NV_ENC_CONFIG m_encode_config;
NV_ENCODE_API_FUNCTION_LIST m_nvencode_api_function_list;
Final<void *> m_encoder_session_handle;
Final<size_t> m_cuda_pitch;
Final<CUdeviceptr> m_cuda_device_ptr;
@ -111,11 +73,17 @@ private:
// Registers preallocated input CUDA buffer for the NvEncoder with nvEncRegisterResource
void registerInputResources(CUdeviceptr cuda_device_ptr);
// Copies input image buffer (in CPU or GPU memory) to the preallocated and registered
// encoder input buffer on GPU (CUDA device)
template<typename T>
bool copyInputBufferIntoGpuMappedMemory(T input_buffer, NV_ENC_MAP_INPUT_RESOURCE &map_input_resource) const;
void setSourceBuffer(const uint8_t * input_buffer, CUDA_MEMCPY2D &memcpy2d) const;
void setSourceBuffer(const CUdeviceptr input_buffer, CUDA_MEMCPY2D &memcpy2d) const;
// Copies encoded image from the encoder to the preallocated output buffer on CPU
bool getEncodedBuffer(NV_ENC_MAP_INPUT_RESOURCE &map_input_resource,
std::vector<uint8_t> &encoded_buffer_out,
high_resolution_clock::time_point & encoding_done) const;
// Starts image encoding

View File

@ -0,0 +1,124 @@
//#include "./utils/timer.h" // for scoped_timer
#include "../utils/logging.h"
#include "turbojpeg_compressor.h"
namespace cgr_libcluster {
TurbojpegCompressor::TurbojpegCompressor() {
m_jpeg_compressor = tjInitCompress();
m_jpeg_decompressor = tjInitDecompress();
}
TurbojpegCompressor::~TurbojpegCompressor() {
if (m_jpeg_compressor != nullptr) {
tjDestroy(m_jpeg_compressor);
}
if (m_jpeg_decompressor != nullptr) {
tjDestroy(m_jpeg_decompressor);
}
}
//#define TIME_JPEG
size_t TurbojpegCompressor::compress(void* src_buffer, int width, int height, int compression_quality,
unsigned char*& jpeg_image) {
// The source buffer is packed RGB: 3 unsigned char channels per pixel
const int subsampling = TJSAMP_444;
size_t jpeg_length = 0; // tjCompress2 will allocate the jpeg_image buffer
jpeg_image = nullptr;
#ifdef TIME_JPEG
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
#endif
if (m_jpeg_compressor == nullptr) {
LOG(ERROR) << "Cannot initialize JPEG compressor";
return 0;
}
int jpeg_error = tjCompress2(m_jpeg_compressor,
(unsigned char*) src_buffer,
width,
0,
height,
TJPF_RGB,
&jpeg_image,
(unsigned long *)&jpeg_length,
subsampling,
compression_quality,
TJFLAG_FASTDCT);
if (jpeg_error < 0) {
const char *jpeg_error_str = tjGetErrorStr();
LOG(ERROR) << "JPEG compression error: " << jpeg_error_str;
return 0;
}
#ifdef TIME_JPEG
clock_gettime(CLOCK_MONOTONIC, &end_time);
// ms time
double elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1e3 + (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
LOG(INFO) << "TIMING: JPEG compression: " << elapsed_time << "ms"
<< ", resolution " << width << "x" << height
<< ", sizes " << (width * height * 3) << " (" << jpeg_length << ")";
#endif
return jpeg_length;
}
bool TurbojpegCompressor::decompress(std::vector<uint8_t> & jpeg_image_buffer, int & width_out, int & height_out,
std::vector<cgr_libcluster::uchar3> & decompressed_image_out) {
#ifdef TIME_JPEG
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
#endif
// Use TurboJPEG to decompress the buffer
int subsampling = 0;
if (m_jpeg_decompressor == nullptr) {
LOG(ERROR) << "Cannot initialize JPEG decompressor";
return false;
}
int width = 0, height = 0;
int jpeg_error = tjDecompressHeader2(m_jpeg_decompressor, jpeg_image_buffer.data(),
jpeg_image_buffer.size(), &width, &height, &subsampling);
if (jpeg_error < 0) {
LOG(ERROR) << "Cannot decode JPEG header from incoming image buffer";
return false;
}
width_out = width;
height_out = height;
const size_t num_pixels = width_out * height_out;
if(decompressed_image_out.size() != num_pixels) {
decompressed_image_out.resize(num_pixels);
}
jpeg_error = tjDecompress2(m_jpeg_decompressor,
jpeg_image_buffer.data(),
jpeg_image_buffer.size(),
(unsigned char*) decompressed_image_out.data(),
width_out,
0,
height_out,
TJPF_RGB,
TJFLAG_ACCURATEDCT);
// Note: m_jpeg_decompressor is destroyed in the destructor
if (jpeg_error < 0) {
const char *jpeg_error_str = tjGetErrorStr();
LOG(ERROR) << "JPEG decompression error: " << jpeg_error_str;
return false;
}
#ifdef TIME_JPEG
clock_gettime(CLOCK_MONOTONIC, &end_time);
// ms time
double elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1e3 + (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
LOG(INFO) << "TIMING: JPEG decompression: " << elapsed_time << "ms"
<< ", resolution " << width_out << "x" << height_out
<< ", sizes " << jpeg_image_buffer.size() << " (" << decompressed_image_out.size() << ")";
#endif
return true;
}
void TurbojpegCompressor::free(uint8_t *memory) {
tjFree(memory);
}
} // cgr_libcluster

View File

@ -0,0 +1,34 @@
#ifndef __TURBOJPEG_COMPRESSOR_H__
#define __TURBOJPEG_COMPRESSOR_H__
#include <vector>
#include <turbojpeg.h>
#include "../utils/vector_types.h" // for uchar3
//#include "net_camera.h"
namespace cgr_libcluster {
class TurbojpegCompressor {
public:
TurbojpegCompressor();
~TurbojpegCompressor();
// Compresses the image buffer into a JPEG image; returns the encoded size in bytes.
// The jpeg_image pointer is allocated by TurboJPEG and must be released with free() below.
size_t compress(void* src_buffer, int width, int height, int compression_quality, unsigned char*& jpeg_image);
// Decompresses a JPEG buffer into an RGB pixel vector, returning the image dimensions.
bool decompress(std::vector<uint8_t> & jpeg_image_buffer, int & width_out, int & height_out,
std::vector<cgr_libcluster::uchar3> & decompressed_image_out);
void free(uint8_t *memory);
private:
tjhandle m_jpeg_compressor = nullptr;
tjhandle m_jpeg_decompressor = nullptr;
};
} // cgr_libcluster
#endif
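A minimal round-trip sketch of this API, assuming it is linked against TurboJPEG; the 4x4 gray test image and the quality value are arbitrary:

#include <vector>
#include "turbojpeg_compressor.h"

int main() {
  const int width = 4, height = 4;
  std::vector<unsigned char> rgb(width * height * 3, 128); // flat gray RGB image
  cgr_libcluster::TurbojpegCompressor codec;
  unsigned char *jpeg = nullptr;
  size_t jpeg_size = codec.compress(rgb.data(), width, height, /*compression_quality=*/90, jpeg);
  if (jpeg_size == 0) return 1;
  std::vector<uint8_t> jpeg_buffer(jpeg, jpeg + jpeg_size);
  codec.free(jpeg); // the buffer was allocated by TurboJPEG inside compress()
  int w = 0, h = 0;
  std::vector<cgr_libcluster::uchar3> pixels;
  if (!codec.decompress(jpeg_buffer, w, h, pixels)) return 1;
  return (w == width && h == height) ? 0 : 1;
}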

View File

@ -0,0 +1,92 @@
#ifndef __CUDA_CONTEXT_PROVIDER_H__
#define __CUDA_CONTEXT_PROVIDER_H__
#include "./utils/logging.h"
#ifdef WITH_CUDA_DYNLOAD
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
namespace cgr_libcluster {
struct CUDAContextScope {
CUDAContextScope(CUcontext ctx) {
CUresult cu_result = cuCtxPushCurrent(ctx);
if(cu_result != CUDA_SUCCESS) {
const char *error_name = NULL;
cuGetErrorName(cu_result, &error_name);
std::string message = "FATAL. cuCtxPushCurrent failed with error: " + std::string(error_name);
LOG(ERROR) << message;
throw std::runtime_error(message);
}
}
~CUDAContextScope() {
CUresult cu_result = cuCtxPopCurrent(NULL);
if(cu_result != CUDA_SUCCESS) {
const char *error_name = NULL;
cuGetErrorName(cu_result, &error_name);
std::string message = "FATAL. cuCtxPopCurrent failed with error: " + std::string(error_name);
LOG(ERROR) << message;
throw std::runtime_error(message);
}
}
};
class CudaContextProvider {
public:
static CUcontext getPrimaryContext(const int device_num) {
static CUcontext cuda_context = 0;
if(cuda_context) {
return cuda_context;
}
#ifdef WITH_CUDA_DYNLOAD
LOG(INFO) << "WITH_CUDA_DYNLOAD is on, call cuewInit to load CUDA library";
// When we run as part of Blender, the CUDA lib is normally already loaded by CUDADevice.
// However, it is not loaded when this is called from unit tests, so call cuewInit here to be sure
// the CUDA lib is loaded; it is safe to call cuewInit multiple times.
if (cuewInit(CUEW_INIT_CUDA) != CUEW_SUCCESS) {
throw std::runtime_error("Error. CUEW failed to load CUDA lib");
}
#endif
CUdevice cuda_device = 0;
CUresult cu_result = cuDeviceGet(&cuda_device, device_num);
// CUDA is normally initialised by CUDADevice; however, it is not initialised
// when this is called from unit tests. Check whether CUDA is initialised and run cuInit if not.
if(cu_result == CUDA_ERROR_NOT_INITIALIZED) {
LOG(WARNING) << "Cuda is not initialised, run cuInit";
cu_result = cuInit(0);
if (cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot initialise CUDA, cuInit failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
cu_result = cuDeviceGet(&cuda_device, device_num);
}
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot get primary cuda context due to cuDeviceGet failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
cu_result = cuDevicePrimaryCtxRetain(&cuda_context, cuda_device);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Failed to get primary cuda context "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
return cuda_context;
}
};
} // namespace cgr_libcluster
#endif
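A minimal usage sketch of the provider and the RAII scope above, assuming a CUDA-capable device 0 is present; cuCtxGetCurrent is used only to show that the context is active inside the scope:

#include "cuda_context_provider.h"

int main() {
  using namespace cgr_libcluster;
  // Retains the primary context for device 0 once; repeated calls return the cached handle.
  CUcontext ctx = CudaContextProvider::getPrimaryContext(0);
  {
    // Pushes the context for the duration of this block; popped automatically on scope exit.
    CUDAContextScope scope(ctx);
    CUcontext current = nullptr;
    cuCtxGetCurrent(&current); // expected to equal ctx inside the scope
  }
  return 0;
}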

View File

@ -1,3 +1,4 @@
#include "../cuda_context_provider.h"
#include "denoising_context.h" #include "denoising_context.h"
#include "master_oidn_denoiser.h" #include "master_oidn_denoiser.h"
#ifdef WITH_OPTIX #ifdef WITH_OPTIX
@ -26,7 +27,8 @@ MasterDenoiser * DenoisersProvider::makeOptixDenoiser(bool save_denoise_io, int
const std::string & output_folder_path, bool is_denoising_passes_on,
const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height) const {
try {
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(MasterOptixDenoiser::DEVICE_NUM);
return new MasterOptixDenoiser(cuda_context, save_denoise_io, save_every_n_images, output_folder_path,
is_denoising_passes_on, image_output_provider, max_img_width, max_img_height);
} catch(std::runtime_error & ex) {
LOG(ERROR) << "DenoisersProvider::makeDenoiser. ERROR. Failed to create an instance of MasterOptixDenoiser due to: "

View File

@ -1,3 +1,6 @@
#include "../cuda_context_provider.h"
#include "../utils/cuda_utils.h"
#include "../utils/timer.h" // for scoped_timer #include "../utils/timer.h" // for scoped_timer
#include "../utils/logging.h" #include "../utils/logging.h"
#include "../server_image.h" // for ServerImage; #include "../server_image.h" // for ServerImage;
@ -7,28 +10,24 @@
#include <optix_denoiser_tiling.h>
#include <optix.h>
namespace cgr_libcluster {
// Constructor throws an exception if it cannot create an instance.
// This prevents creation of an object that is not functional.
// It doesn't affect rendering performance, as this happens once at the init stage.
MasterOptixDenoiser::MasterOptixDenoiser(CUcontext cuda_context, bool save_denoise_io, int save_every_n_images,
const std::string & output_folder_path, bool is_denoising_passes_on,
const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height) :
MasterDenoiser(
ClusterSessionParams::MasterDenoiser::MASTER_DENOISER_OPTIX, save_denoise_io, save_every_n_images,
output_folder_path, is_denoising_passes_on, image_output_provider),
max_img_width(max_img_width), max_img_height(max_img_height),
cuda_context(cuda_context) {
LOG(INFO) << "Creating MasterOptixDenoiser with width: " << max_img_width << " height: " << max_img_height <<
" save_io: " << save_denoise_io << " save_every_n_images: " << save_every_n_images << " output_folder_path: " <<
output_folder_path << " is_denoising_passes_on: " << is_denoising_passes_on;
OptixDeviceContext optix_context = nullptr;
OptixDenoiserOptions denoiser_options;
if(is_denoising_passes_on) {
@ -39,166 +38,69 @@ MasterOptixDenoiser::MasterOptixDenoiser(bool save_denoise_io, int save_every_n_
denoiser_options.guideNormal = 0;
}
//denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
OptixDeviceContextOptions options = {};
const CUDAContextScope scope(cuda_context);
if (g_optixFunctionTable.optixDeviceContextCreate != NULL) {
LOG(INFO) << "Optix function table is already initialized, continue.";
} else {
LOG(INFO) << "Initializing Optix...";
OPTIX_API_CALL(optixInit(), THROW_IF_ERROR);
LOG(INFO) << "Initialization of Optix function table complete";
}
OPTIX_API_CALL(optixDeviceContextCreate(cuda_context, &options, &optix_context), THROW_IF_ERROR);
//TODO: figure out how to support KIND_TEMPORAL, needs velocity vector
OPTIX_API_CALL(optixDenoiserCreate(optix_context, OPTIX_DENOISER_MODEL_KIND_HDR,
&denoiser_options, &denoiser), THROW_IF_ERROR);
if(denoiser == nullptr) {
throw std::runtime_error("Error. Cannot create MasterOptixDenoiser because optixDenoiserCreate returned no denoiser");
}
// OPTIX_API_CALL(optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, nullptr, 0), THROW_IF_ERROR);
memset(&denoiser_sizes, 0, sizeof(OptixDenoiserSizes));
OPTIX_API_CALL(optixDenoiserComputeMemoryResources(denoiser, max_img_width, max_img_height, &denoiser_sizes),
THROW_IF_ERROR);
CUDA_API_CALL(cuMemAlloc(&state_denoiser, denoiser_sizes.stateSizeInBytes), THROW_IF_ERROR);
CUDA_API_CALL(cuMemAlloc(&scratch_denoiser, denoiser_sizes.withoutOverlapScratchSizeInBytes), THROW_IF_ERROR);
CUDA_API_CALL(cuStreamCreate(&cuda_stream, CU_STREAM_DEFAULT), THROW_IF_ERROR);
OPTIX_API_CALL(optixDenoiserSetup(
denoiser,
cuda_stream,
max_img_width,
max_img_height,
state_denoiser,
denoiser_sizes.stateSizeInBytes,
scratch_denoiser,
denoiser_sizes.withoutOverlapScratchSizeInBytes), THROW_IF_ERROR);
allocateCudaBuffer(&cuda_pixels_buffer, getMaxBufferSize(), "input/noisy pixels buffer");
if(is_denoising_passes_on) {
allocateCudaBuffer(&cuda_albedo_buffer, getMaxBufferSize(), "input albedo buffer");
allocateCudaBuffer(&cuda_normal_buffer, getMaxBufferSize(), "input normal buffer");
}
allocateCudaBuffer(&cuda_denoised_pixels_buffer, getMaxBufferSize(), "denoised pixels buffer");
LOG(INFO) << "MasterOptixDenoiser creation is complete";
}
MasterOptixDenoiser::~MasterOptixDenoiser() {
LOG(INFO) << "Destroying MasterOptixDenoiser";
const CUDAContextScope scope(cuda_context);
CUDA_API_CALL(cuMemFree(scratch_denoiser), DO_NOT_THROW);
CUDA_API_CALL(cuMemFree(state_denoiser), DO_NOT_THROW);
CUDA_API_CALL(cuStreamDestroy(cuda_stream), DO_NOT_THROW);
OPTIX_API_CALL(optixDenoiserDestroy(denoiser), DO_NOT_THROW);
releaseCudaBuffers();
LOG(INFO) << "Destroyed MasterOptixDenoiser"; LOG(INFO) << "Destroyed MasterOptixDenoiser";
} }
void MasterOptixDenoiser::allocateCudaBuffer(CUdeviceptr * buffer_ptr, size_t buffer_size, const std::string & buffer_name) { void MasterOptixDenoiser::allocateCudaBuffer(CUdeviceptr * buffer_ptr, size_t buffer_size, const std::string & buffer_name) {
LOG(INFO) << "Allocating cuda memory for " << buffer_name; LOG(INFO) << "Allocating cuda memory for " << buffer_name;
CUresult cu_result = cuMemAlloc(buffer_ptr, buffer_size); CUDA_API_CALL(cuMemAlloc(buffer_ptr, buffer_size), THROW_IF_ERROR);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Could not allocate memory for " + buffer_name + " on device. Cuda error code: "
+ std::to_string(cu_result);
throw std::runtime_error(message);
}
} }
size_t MasterOptixDenoiser::getMaxBufferSize() {
@ -206,32 +108,17 @@ size_t MasterOptixDenoiser::getMaxBufferSize() {
}
void MasterOptixDenoiser::releaseCudaBuffers() {
CUDA_API_CALL(cuMemFree(cuda_pixels_buffer), DO_NOT_THROW);
cuda_pixels_buffer = 0;
if(cuda_albedo_buffer) {
CUDA_API_CALL(cuMemFree(cuda_albedo_buffer), DO_NOT_THROW);
cuda_albedo_buffer = 0;
}
if(cuda_normal_buffer) {
CUDA_API_CALL(cuMemFree(cuda_normal_buffer), DO_NOT_THROW);
cuda_normal_buffer = 0;
}
CUDA_API_CALL(cuMemFree(cuda_denoised_pixels_buffer), DO_NOT_THROW);
cuda_denoised_pixels_buffer = 0;
}
@ -252,11 +139,7 @@ bool MasterOptixDenoiser::initInputOptixImage(OptixImage2D &input_image, CUdevic
const int row_stride = server_image.getWidth() * pixel_stride;
const size_t buffer_size_bytes = sizeof(cgr_libcluster::float3) * server_image.getWidth() * server_image.getHeight();
CUDA_API_CALL(cuMemcpyHtoD(input_buffer_device, input_buffer_host, buffer_size_bytes), THROW_IF_ERROR);
input_image.data = input_buffer_device;
input_image.width = server_image.getWidth();
input_image.height = server_image.getHeight();
@ -267,6 +150,7 @@ bool MasterOptixDenoiser::initInputOptixImage(OptixImage2D &input_image, CUdevic
}
DenoisingResult MasterOptixDenoiser::denoise(ServerImage &server_image) {
const CUDAContextScope scope(cuda_context);
if(save_denoise_io) {
if (save_every_n_images <= 0) {
LOG(ERROR) << "Error: invalid save_every_n_images: " << save_every_n_images;
@ -314,7 +198,7 @@ DenoisingResult MasterOptixDenoiser::denoise(ServerImage &server_image) {
OptixResult optix_result = optixDenoiserInvoke(
denoiser,
cuda_stream,
&params_denoiser,
state_denoiser,
denoiser_sizes.stateSizeInBytes,
@ -329,24 +213,28 @@ DenoisingResult MasterOptixDenoiser::denoise(ServerImage &server_image) {
LOG(ERROR) << "Error. Optix denoise failed. OPTIX error code: " << optix_result; LOG(ERROR) << "Error. Optix denoise failed. OPTIX error code: " << optix_result;
return DenoisingResult::FAILED; return DenoisingResult::FAILED;
} }
// explicitly wait till denoising is complete
const size_t buffer_size_bytes = sizeof(cgr_libcluster::float3) * server_image.getWidth() * server_image.getHeight(); cuStreamSynchronize(cuda_stream);
CUresult cu_result = cuMemcpyDtoH(server_image.denoised_pixels, cuda_denoised_pixels_buffer, buffer_size_bytes); denoised_buffer_size_bytes = sizeof(cgr_libcluster::float3) * server_image.getWidth() * server_image.getHeight();
server_image.denoised = true;
LOG(INFO) << "denoising time for frame: " << server_image.camera.frame << " " << denoizer_timer.elapsed();
return DenoisingResult::OK;
}
CUdeviceptr MasterOptixDenoiser::getCudaMemPointerToDenoisedImage() const {
return cuda_denoised_pixels_buffer;
}
CUstream MasterOptixDenoiser::getCudaStream() const {
return cuda_stream;
}
void MasterOptixDenoiser::copyDenoisedImageFromCudaToHostMemory(uint8_t * dest_host_memory) {
scoped_timer copy_device_2_host_timer;
const CUDAContextScope scope(cuda_context);
CUDA_API_CALL(cuMemcpyDtoHAsync(dest_host_memory, cuda_denoised_pixels_buffer,
denoised_buffer_size_bytes, cuda_stream), THROW_IF_ERROR);
CUDA_API_CALL(cuStreamSynchronize(cuda_stream), THROW_IF_ERROR);
LOG(INFO) << "time to copy denoised image from device to host: " << copy_device_2_host_timer.elapsed();
} }
} // cgr_libcluster

View File

@ -8,6 +8,8 @@
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
#include <optix_stubs.h>
@ -32,21 +34,27 @@ public:
static const size_t DEFAULT_IMAGE_WIDTH_PIXELS = 1024;
static const size_t DEFAULT_IMAGE_HEIGHT_PIXELS = 1024;
MasterOptixDenoiser(CUcontext cuda_context, bool save_denoise_io, int save_every_n_images,
const std::string & output_folder_path, bool is_denoising_passes_on,
const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height);
virtual ~MasterOptixDenoiser();
virtual DenoisingResult denoise(ServerImage &server_image) override;
MasterOptixDenoiser(MasterOptixDenoiser const&) = delete;
void operator=(MasterOptixDenoiser const&) = delete;
CUstream getCudaStream() const;
CUdeviceptr getCudaMemPointerToDenoisedImage() const;
void copyDenoisedImageFromCudaToHostMemory(uint8_t * dest_host_memory);
// For denoising we always use device # 0
// rendering can be assigned to devices starting from #1
// so denoising and rendering do not run on the same device and do not fight for GPU resources
static const int DEVICE_NUM = 0;
private:
void allocateCudaBuffer(CUdeviceptr * buffer_ptr, size_t buffer_size, const std::string & buffer_name);
void releaseCudaBuffers();
size_t getMaxBufferSize();
@ -54,6 +62,18 @@ private:
ServerImage &server_image, ImagePixel * input_buffer_host);
void initOutputOptixImage(OptixImage2D &output_image, ServerImage &server_image);
// We set the cuda stream explicitly so components which use the denoised image downstream,
// like pixel conversion and image compression, can run within the same cuda stream
// and therefore be implicitly synchronized for the best possible performance
// (see the stream-ordering sketch after this class).
// For now we synchronize explicitly by calling cuStreamSynchronize() in denoise()
// so denoise time logging outputs the correct denoising duration. Denoising time calculation
// and logging needs to be updated upon switching to implicit synchronization by the CUDA runtime.
// Here is a task for that improvement: T143205954.
// Currently the cuda stream is owned by this class for simplicity. Alternatively the stream
// could be passed from outside, but this requires more changes in ServerImageBuffer and DenoisingContext.
// Not making this change at this point to limit the scope to the nvdecoder integration;
// it can be done as part of the synchronization improvement task mentioned above.
CUstream cuda_stream = nullptr;
CUcontext cuda_context = nullptr;
OptixDenoiser denoiser = nullptr;
OptixDenoiserSizes denoiser_sizes;
@ -64,7 +84,11 @@ private:
CUdeviceptr cuda_pixels_buffer = 0; CUdeviceptr cuda_pixels_buffer = 0;
CUdeviceptr cuda_albedo_buffer = 0; CUdeviceptr cuda_albedo_buffer = 0;
CUdeviceptr cuda_normal_buffer = 0; CUdeviceptr cuda_normal_buffer = 0;
CUdevice m_cuda_device;
CUdeviceptr cuda_denoised_pixels_buffer = 0;
size_t denoised_buffer_size_bytes = 0;
int max_img_width = DEFAULT_IMAGE_WIDTH_PIXELS;
int max_img_height = DEFAULT_IMAGE_HEIGHT_PIXELS;
};

View File

@ -90,6 +90,7 @@ bool NetCamera::operator==(const NetCamera &other_camera) const
this->compression_quality == other_camera.compression_quality &&
this->master_denoiser == other_camera.master_denoiser &&
this->master_image_color_format == other_camera.master_image_color_format &&
this->master_image_compressor == other_camera.master_image_compressor &&
this->worker_map == other_camera.worker_map;
}

View File

@ -3,6 +3,7 @@ namespace cgr_libcluster;
enum CameraTypeFlatBuffer:byte { CAMERA_PERSPECTIVE = 0, CAMERA_ORTHOGRAPHIC, CAMERA_PANORAMA }
enum MasterDenoiserFlatBuffer:byte { MASTER_DENOISER_NONE = 0, MASTER_DENOISER_OIDN, MASTER_DENOISER_OPTIX, MASTER_DENOISER_BARCELONA }
enum MasterImageColorFormatFlatBuffer:byte { MASTER_IMAGE_COLOR_FORMAT_LINEAR = 1, MASTER_IMAGE_COLOR_FORMAT_SRGB = 2}
enum MasterImageCompressorFlatBuffer:byte { MASTER_IMAGE_COMPRESSOR_JPEG = 1, MASTER_IMAGE_COMPRESSOR_NVENCODER = 2}
table NetCameraFlatBuffer {
cam_matrix:TransformFlatBuffer;
@ -28,6 +29,7 @@ table NetCameraFlatBuffer {
worker_map:WorkerMapFlatBuffer;
scene_frame:int;
master_image_color_format:MasterImageColorFormatFlatBuffer = MASTER_IMAGE_COLOR_FORMAT_LINEAR;
master_image_compressor:MasterImageCompressorFlatBuffer = MASTER_IMAGE_COMPRESSOR_JPEG;
}
table TransformFlatBuffer {

View File

@ -90,6 +90,8 @@ public:
bool expect_modify_object_message = false;
ClusterSessionParams::MasterImageColorFormat master_image_color_format =
ClusterSessionParams::DEFAULT_MASTER_IMAGE_COLOR_FORMAT;
ClusterSessionParams::MasterImageCompressor master_image_compressor =
ClusterSessionParams::DEFAULT_MASTER_IMAGE_COMPRESSOR;
};

View File

@ -8,6 +8,16 @@
#include "denoising/denoising_context.h" #include "denoising/denoising_context.h"
#include "streamed_image.h" #include "streamed_image.h"
// Not sure that this is the best place to set OS_LINUX.
// It should arguably be set somewhere at a higher level;
// setting it here for now to get the system built.
// TODO: revisit this flag later
#if defined(linux) || defined(__linux) || defined(__linux__)
# ifndef OS_LINUX
# define OS_LINUX
# endif
#endif
namespace cgr_libcluster {
NetServer::~NetServer()
@ -54,8 +64,8 @@ void NetServer::stop()
netThread = new std::thread(std::bind(&NetServer::run, this));
}
}
tbb::mutex::scoped_lock imageStoplock(server_image_buffer.serverImageStopMutex);
server_image_buffer.reset();
}
void NetServer::createStreamedImage(unsigned int readIndex, ServerImage &currentImage, DenoisingContext & denoising_context)
@ -65,7 +75,7 @@ void NetServer::createStreamedImage(unsigned int readIndex, ServerImage &current
#ifndef WORKER_IMAGE_STREAMING_TEST
mergeWithWorkers = theClient->mergeWith(currentImage);
#endif
server_image_buffer.createStreamedImage(readIndex, mergeWithWorkers, denoising_context);
}
}
@ -87,6 +97,7 @@ void NetServer::set_save_every_n_images(int save_every_n_images) {
void NetServer::set_output_folder_path(const std::string & output_folder_path) {
this->output_folder_path = output_folder_path;
server_image_buffer.set_output_folder_path(output_folder_path);
}
void NetServer::run()
@ -100,13 +111,15 @@ void NetServer::run()
for (;;) {
if (stopped) break;
tbb::mutex::scoped_lock imageStopLock(server_image_buffer.serverImageStopMutex);
if (!server_image_buffer.isEmpty()) {
int readIndex = server_image_buffer.readIndex;
LOG(INFO) << "server image indices: " << server_image_buffer.writeIndex << " " << server_image_buffer.readIndex;
ServerImage& currentImage = server_image_buffer.readImage();
//if has client, combine with correct client image before sending
if (isMaster()) {
createStreamedImage(readIndex, currentImage, denoising_context);
}
RPCSend streamSnd(image_socket, &error_func, SERVER_STREAM_IMAGE_CMD);
if (error_func.have_error()) {
@ -114,7 +127,7 @@ void NetServer::run()
}
if (isMaster()) {
// Create a StreamedImage
StreamedImage &streamedImage = server_image_buffer.streamedImageBuffer[readIndex];
send_streamed_image(streamSnd, streamedImage);
if (save_streamed_image) {
if (save_every_n_images <= 0) {
@ -135,12 +148,16 @@ void NetServer::run()
time_sleep(NET_IMAGE_PAUSE);
}
} //endfor
#if defined(OS_LINUX) && defined(WITH_CUDA)
server_image_buffer.deleteNvEncoder();
#endif
VLOG(3) << "Exit NetServer run loop";
}
NetServer::NetServer(
const char *masterAddr, unsigned short masterPort)
: NetBase(SERVER_PORT, masterAddr),
theClient(NULL), server_image_buffer(IMAGE_FRAME_COUNT, true, true),
masterPort(masterPort), net_camera_command(net_camera, modify_object_message)
{
// acceptConnection();
@ -159,7 +176,7 @@ ClusterRenderCommand& NetServer::wait_for_client_command() {
rcv.read_buffer(&cam, sizeof(cam));
LOG(INFO) << "server receive camera for frame: " << cam.frame << " cam_width: " << cam.cam_width << " cam_height: " << cam.cam_height;
numCameraReceived++;
server_image_buffer.set_master_image_color_format(cam.master_image_color_format);
if(cam.expect_modify_object_message) {
rcv.read_buffer(&modify_object_message, sizeof(modify_object_message));
if (theClient) {
@ -180,7 +197,7 @@ ClusterRenderCommand& NetServer::wait_for_client_command() {
childCam.sampleCount = cam.sampleCount * theClient->getChildrenCount();
theClient->send_camera(childCam);
}
server_image_buffer.begin_frame(cam);
return net_camera_command;
} else if (rcv.name == KILL_CMD) {
LOG(INFO) << "received terminate message...exiting";
@ -200,7 +217,7 @@ bool NetServer::send_tile(const NetRenderTile& rtile)
//we have to insert NetCamera associated with image
//before first tile insertion
ServerImage* activeImage = server_image_buffer.getActiveImage();
assert(activeImage);
if (activeImage) {
//no locking because tile insertions are for different elements in the buffer
@ -208,7 +225,7 @@ bool NetServer::send_tile(const NetRenderTile& rtile)
if (activeImage->tile_count == getMaxTileCount()) {
activeImage->imageCount = 1;
activeImage->sampleCount = activeImage->camera.sampleCount;
server_image_buffer.endInsertImage();
}
std::cout << "Insert tile progress: "<< activeImage->tile_count << " "<< getMaxTileCount() << "\n";
return true;

View File

@ -22,7 +22,7 @@ class NetServer: public NetBase {
NetClient *theClient; //client of worker servers
void createStreamedImage(unsigned int index, ServerImage& curImage, DenoisingContext & denoising_context);
ServerImageBuffer server_image_buffer;
unsigned short masterPort;

View File

@ -23,14 +23,12 @@ public:
size_t size;
};
// Pass in the FlatBufferBuilder, otherwise the memory won't exist after the method exits
static Buffer serialize(flatbuffers::FlatBufferBuilder &builder, const NetCamera & net_camera);
static void deserialize(uint8_t *buffer_pointer, NetCamera &net_camera_out);
private:
static void bufferToFloat4(const Float4FlatBuffer * float4_flatbuffer, cgr_libcluster::float4 & float4_out);
static void bufferToTransform(const TransformFlatBuffer * cam_matrix_flatbuffer, cgr_libcluster::Transform & cam_matrix_out);
};

View File

@ -125,6 +125,8 @@ size_t ServerImage::read(RPCReceive &rcv)
NetCamera net_camera;
rcv.read_buffer(&net_camera, sizeof(NetCamera));
size_t buf_size = net_camera.cam_width * net_camera.cam_height * sizeof (ImagePixel);
// log line below is used by get_metrics.py to calculate stats. If you change it
// please make sure get_metrics.py still works correctly. Update if needed.
LOG(INFO) << "net camera received for frame: " << net_camera.frame << " " << net_camera.cam_width << " " << net_camera.cam_height; LOG(INFO) << "net camera received for frame: " << net_camera.frame << " " << net_camera.cam_width << " " << net_camera.cam_height;
rcv.read_buffer(pixels, buf_size); rcv.read_buffer(pixels, buf_size);
LOG(INFO) << "read image for frame: " << net_camera.frame << " with sample: " << net_camera.sampleCount << " of size: " << buf_size; LOG(INFO) << "read image for frame: " << net_camera.frame << " with sample: " << net_camera.sampleCount << " of size: " << buf_size;

View File

@ -1,22 +1,54 @@
#include "net_base.h" // for NET_IMAGE_TIMEOUT
#include <fstream> // to save video stream
#include <tbb/parallel_for.h>
#ifdef WITH_CUDA_DYNLOAD
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
#ifdef WITH_CUDA
#include "compression/nv_decoder.h"
#include "compression/nv_encoder.h"
#endif
#include "compression/turbojpeg_compressor.h"
#include "cuda_context_provider.h"
#include "./utils/timer.h" // for time_dt #include "./utils/timer.h" // for time_dt
#include "denoising/denoising_context.h" #include "denoising/denoising_context.h"
#include "net_base.h" // for NET_IMAGE_TIMEOUT #ifdef WITH_OPTIX
#include "denoising/master_optix_denoiser.h"
#endif
#include "server_image_buffer.h" #include "server_image_buffer.h"
#include "server_image.h" #include "server_image.h"
#include "../libcluster_cuda_kernels/gpu_image_utils.h"
namespace cgr_libcluster {
const string ServerImageBuffer::VIDEO_FULL_FILE_NAME = "master_nvencoded_video_stream.h264";
ServerImageBuffer::ServerImageBuffer(int s, bool hasServerImage, bool hasStreamImage): imageBuffer(NULL), buffer_size(s)
{
if (hasServerImage) imageBuffer = new ServerImage[buffer_size];
if (hasStreamImage) streamedImageBuffer.resize(buffer_size);
turbojpeg_compressor_uptr.reset(new TurbojpegCompressor());
reset();
}
ServerImageBuffer::~ServerImageBuffer()
{
if (imageBuffer) delete []imageBuffer;
#if defined(OS_LINUX) && defined(WITH_CUDA)
releaseImageCompressionCudaBuffers();
#endif
}
ServerImage* ServerImageBuffer::getActiveImage() {
@ -29,6 +61,12 @@ ServerImage& ServerImageBuffer::getReadImage() {
return imageBuffer[readIndex];
}
void ServerImageBuffer::set_output_folder_path(const std::string & output_folder_path) {
this->output_folder_path = output_folder_path;
encoded_videostream_filename = output_folder_path + "/" + VIDEO_FULL_FILE_NAME;
VLOG(3) << "Path to the nvencoded video stream file: " << encoded_videostream_filename;
}
void ServerImageBuffer::reset()
{
readIndex = 0;
@ -95,6 +133,7 @@ void ServerImageBuffer::endInsertImage()
//but I am leaving it fixed for now
void ServerImageBuffer::parallel_convert(const ServerImage& src, std::vector<cgr_libcluster::uchar3>& dst)
{
scoped_timer pixel_convert_timer;
int ts = src.getWidth() * src.getHeight();
int thread_count = PIXEL_THREAD_COUNT;
int interval = ts/thread_count;
@ -118,6 +157,7 @@ void ServerImageBuffer::parallel_convert(const ServerImage& src, std::vector<cgr
VLOG(1) << "ERROR. Unknown master image color format. We shuld not get here. Requested color format value: " VLOG(1) << "ERROR. Unknown master image color format. We shuld not get here. Requested color format value: "
<< master_image_color_format; << master_image_color_format;
} }
LOG(INFO) << "pixel conversion time for frame: " << src.camera.frame << " " << pixel_convert_timer.elapsed();
}
void ServerImageBuffer::normalizeImage(ServerImage &server_image) {
@ -134,22 +174,220 @@ void ServerImageBuffer::normalizeImage(ServerImage &server_image) {
}
}
void ServerImageBuffer::writeImageToVideoStreamFile(
const std::vector<uint8_t> &encoded_image) const {
if(encoded_videostream_filename.length() < 1) {
return;
}
std::ofstream video_stream_file;
video_stream_file.open(encoded_videostream_filename, std::ios::app | std::ios::binary);
if(video_stream_file.is_open()) {
video_stream_file.write(reinterpret_cast<const char*>(encoded_image.data()), encoded_image.size());
VLOG(3) << "Wrote encoded image of size: " << encoded_image.size();
video_stream_file.close();
} else {
std::string message = "FATAL. Unable to open video stream output file: " + encoded_videostream_filename;
throw std::invalid_argument(message);
}
}
#if defined(OS_LINUX) && defined(WITH_CUDA)
void ServerImageBuffer::allocateCudaBuffer(CUdeviceptr * buffer_ptr, size_t buffer_size, const std::string & buffer_name) {
LOG(INFO) << "Allocating cuda memory for: " << buffer_name;
CUresult cu_result = cuMemAlloc(buffer_ptr, buffer_size);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Could not allocate memory for " + buffer_name + " on device. Cuda error code: "
+ std::to_string(cu_result);
throw std::runtime_error(message);
}
}
void ServerImageBuffer::resetNvEncoder(const ServerImage &server_image) {
VLOG(3) << "Resetting NvEncoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
// Renderer, optix denoiser and nvencoder share the same cuda context; call cuCtxSynchronize
// to make sure all operations on the cuda context are completed before we start creating the NvEncoder.
// Not doing this may lead to sporadic problems, like the nvEncDestroyEncoder call hanging.
cuCtxSynchronize();
nv_encoder_uptr.reset(new NvEncoder(
NV_ENC_BUFFER_FORMAT_NV12,
cuda_context,
server_image.getWidth(),
server_image.getHeight()));
VLOG(3) << "Created NvEncoder successfully";
releaseImageCompressionCudaBuffers();
allocateImageCompressionCudaBuffers(server_image);
VLOG(3) << "Resetting NvEncoder done";
}
void ServerImageBuffer::copyServerImageToGpuMemory(const ServerImage & server_image, CUdeviceptr & linear_image_buffer_gpu) {
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
uint8_t* linear_image_buffer_cpu = server_image.denoised ? (uint8_t*)server_image.denoised_pixels :
(uint8_t*)server_image.pixels;
CUresult cu_result = cuMemcpyHtoD(linear_image_buffer_gpu, linear_image_buffer_cpu, server_image.getBufferSize());
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Could not copy input buffer from host to device memory. CUDA error code: "
+ std::to_string(cu_result);
throw std::runtime_error(message);
}
}
void ServerImageBuffer::allocateImageCompressionCudaBuffers(const ServerImage &server_image) {
const size_t num_pixels = server_image.getImageSize();
// 1 byte per color channel
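// NV12 memory layout assumed below: a full-resolution luma (Y) plane of num_pixels bytes,
// followed by an interleaved chroma (UV) plane subsampled 2x2, i.e. num_pixels/2 more bytes.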
const int nvencoder_yuv_nv12_buffer_size = num_pixels + (num_pixels)/2;
allocateCudaBuffer(&nvencoder_input_buffer_gpu, nvencoder_yuv_nv12_buffer_size,
"input cuda buffer for nvencoder");
allocateCudaBuffer(&linear_image_buffer_gpu, server_image.getBufferSize(),
"cuda buffer to accept linear images from CPU memory");
}
void ServerImageBuffer::releaseImageCompressionCudaBuffers() {
CUresult cu_result = cuMemFree(nvencoder_input_buffer_gpu);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for nvencoder_input_buffer_gpu with error code:" + std::to_string(cu_result);
}
nvencoder_input_buffer_gpu = 0;
cu_result = cuMemFree(linear_image_buffer_gpu);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for linear_image_buffer_gpu with error code:" + std::to_string(cu_result);
}
linear_image_buffer_gpu = 0;
}
void ServerImageBuffer::deleteNvEncoder() {
nv_encoder_uptr.reset(nullptr);
}
#endif // WITH_CUDA
void ServerImageBuffer::createStreamedImage(int index, bool scaled, DenoisingContext & denoising_context)
{
ServerImage &server_image = imageBuffer[index];
StreamedImage &streamed_image = streamedImageBuffer[index];
streamed_image.initImage(server_image.camera);
server_image.denoised = false;
if(!scaled) {
normalizeImage(server_image);
}
float sampleCount = server_image.sampleCount;
#if defined(OS_LINUX) && defined(WITH_CUDA)
const bool use_nvencoder_for_compression = server_image.camera.master_image_compressor ==
ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER;
if(use_nvencoder_for_compression) {
if(!nv_encoder_uptr || server_image.camera.frame == 0) {
resetNvEncoder(server_image);
}
}
#else
if(server_image.camera.master_image_compressor == ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
throw std::runtime_error("ERROR. NVENCODER compressor is requested. Server is compiled without CUDA support\
so has no nvencoder and can not encode images with nvencoder.\
Recompile with CUDA or use JPEG compressor instead. Terminating.");
}
#endif
if(server_image.camera.master_denoiser) {
bool denoising_ok = denoiseImage(server_image, denoising_context);
if(!denoising_ok) {
throw std::runtime_error("Image denoising failed.");
}
}
MasterDenoiser * master_denoiser = denoising_context.getDenoiser(server_image.camera.master_denoiser);
const bool image_is_denoised_by_optix_denoiser = master_denoiser &&
master_denoiser->getType() == ClusterSessionParams::MasterDenoiser::MASTER_DENOISER_OPTIX;
#ifdef WITH_OPTIX
MasterOptixDenoiser* optix_denoiser = nullptr;
if(image_is_denoised_by_optix_denoiser) {
optix_denoiser = dynamic_cast<MasterOptixDenoiser*>(master_denoiser);
}
#endif
scoped_timer compression_timer;
#if defined(OS_LINUX) && defined(WITH_CUDA)
if(use_nvencoder_for_compression) {
#ifdef WITH_OPTIX
compressWithNvencoder(server_image, streamed_image, optix_denoiser);
#else
compressWithNvencoder(server_image, streamed_image);
#endif
} else {
#ifdef WITH_OPTIX
compressWithTurbojpeg(server_image, streamed_image, optix_denoiser);
#else
compressWithTurbojpeg(server_image, streamed_image);
#endif
}
#else
#ifdef WITH_OPTIX
compressWithTurbojpeg(server_image, streamed_image, optix_denoiser);
#else
compressWithTurbojpeg(server_image, streamed_image);
#endif
#endif
double compression_time = compression_timer.elapsed();
LOG(INFO) << "compression time for frame: " << server_image.camera.frame << " " << compression_time <<
" (includes pixel conversion time)";
}
#if defined(OS_LINUX) && defined(WITH_CUDA)
#ifdef WITH_OPTIX
void ServerImageBuffer::compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image,
MasterOptixDenoiser* optix_denoiser) {
#else
void ServerImageBuffer::compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image) {
#endif
CUstream cuda_stream = nullptr;
CUdeviceptr image_to_compress_gpu_ptr = 0;
#ifdef WITH_OPTIX
if(optix_denoiser) {
cuda_stream = optix_denoiser->getCudaStream();
image_to_compress_gpu_ptr = optix_denoiser->getCudaMemPointerToDenoisedImage();
} else
#endif
{
copyServerImageToGpuMemory(server_image, linear_image_buffer_gpu);
image_to_compress_gpu_ptr = linear_image_buffer_gpu;
}
const bool useSrgbColorSpace =
master_image_color_format == ClusterSessionParams::MasterImageColorFormat::MASTER_IMAGE_COLOR_FORMAT_SRGB;
gpuRawRgbToGammaCorrectedYuvNv12(image_to_compress_gpu_ptr, server_image.getWidth(),
server_image.getHeight(), useSrgbColorSpace, cuda_stream, nvencoder_input_buffer_gpu);
nv_encoder_uptr->encode(nvencoder_input_buffer_gpu, streamed_image.getCompressedImage());
}
#endif // WITH_CUDA
#ifdef WITH_OPTIX
void ServerImageBuffer::compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image,
MasterOptixDenoiser* optix_denoiser) {
#else
void ServerImageBuffer::compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image) {
#endif
#ifdef WITH_OPTIX
if(optix_denoiser) {
optix_denoiser->copyDenoisedImageFromCudaToHostMemory(
(uint8_t*)server_image.denoised_pixels);
server_image.denoised = true;
}
#endif
parallel_convert(server_image, streamed_image.getByteBuffer());
uint8_t *jpeg_data = NULL;
size_t buf_size = turbojpeg_compressor_uptr->compress(streamed_image.getByteBuffer().data(),
server_image.getWidth(), server_image.getHeight(),
server_image.camera.compression_quality,
jpeg_data);
if (jpeg_data) {
streamed_image.copyInCompressedImage(jpeg_data, buf_size);
turbojpeg_compressor_uptr->free(jpeg_data);
}
}
bool ServerImageBuffer::denoiseImage(ServerImage &bufferImage, DenoisingContext & denoising_context) {
@ -162,7 +400,22 @@ bool ServerImageBuffer::denoiseImage(ServerImage &bufferImage, DenoisingContext
if(result == DenoisingResult::OK) {
return true;
}
#ifdef WITH_OPTIX
else if (result == DenoisingResult::IMAGE_TOO_BIG &&
master_denoiser->getType() == ClusterSessionParams::MasterDenoiser::MASTER_DENOISER_OPTIX) {
bool replacement_ok = denoising_context.replaceOptixDenoiser(bufferImage.getWidth(), bufferImage.getHeight());
if(replacement_ok) {
master_denoiser = denoising_context.getDenoiser(bufferImage.camera.master_denoiser);
if(master_denoiser) {
return master_denoiser->denoise(bufferImage) == DenoisingResult::OK;
} else {
LOG(WARNING) << "WARNING. No denoiser instance of requested type: " << bufferImage.camera.master_denoiser;
}
} else {
LOG(ERROR) << "ERROR. replaceOptixDenoiser failed, image will not be denoised";
}
}
#endif
return false;
}
@ -209,19 +462,43 @@ int ServerImageBuffer::add_streamed_image(RPCReceive &rcv,
}
if (!isFull() ) {
int frameIndex = writeIndex;
StreamedImage &streamed_image = streamedImageBuffer[frameIndex];
streamed_image.read(rcv);
scoped_timer decompress_timer;
#if defined(OS_LINUX) && defined(WITH_CUDA)
const bool use_nvdecoder_for_decompression = streamed_image.getNetCamera().master_image_compressor ==
ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER;
if(use_nvdecoder_for_decompression) {
if(!nv_decoder_uptr || streamed_image.getFrame() == 0) {
VLOG(3) << "Creating nvdecoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
nv_decoder_uptr.reset(new NvDecoder(cuda_context));
}
nv_decoder_uptr->decode(streamed_image.getCompressedImage(),
streamed_image.getFrame(), &streamed_image.getRgbImageBuffer());
} else
#endif
{
int width = 0, height = 0;
streamed_image.getImage(width, height);
if (!turbojpeg_compressor_uptr->decompress(streamed_image.getCompressedImage(), width, height,
streamed_image.getRgbImageBuffer())) {
LOG(ERROR) << "jpeg decompression failed";
}
}
LOG(INFO) << "decompress image for frame: " << streamed_image.getFrame() << " " << decompress_timer.elapsed();
if (save_streamed_image) {
if (!streamed_image.saveImage(output_folder_path, "client_streamed_image_rgb_", frame_id)) {
LOG(ERROR) << "failed to save streamed image";
}
}
incWriteIndex();
LOG(INFO) << "thin client insert master image frame: "<< streamed_image.getFrame();
return frameIndex;
} else {
StreamedImage streamed_image;
streamed_image.read(rcv);
return -1;
}
}

View File

@ -11,16 +11,33 @@
#define IMAGE_FRAME_COUNT 10
#define PIXEL_THREAD_COUNT 60
// Not sure that this is the best place to set OS_LINUX.
// IMO it should be set somewhere at a higher level;
// setting it here for now to get the system built.
// TODO: revisit this flag later
#if defined(linux) || defined(__linux) || defined(__linux__)
# ifndef OS_LINUX
# define OS_LINUX
# endif
#endif
namespace cgr_libcluster {
using std::string;
class Camera;
class MasterOptixDenoiser;
class NetCamera;
#if defined(OS_LINUX) && defined(WITH_CUDA)
class NvEncoder;
class NvDecoder;
#endif
class RPCSend;
class RPCReceive;
class PathTraceDisplay;
class ServerImage;
class DenoisingContext;
class TurbojpegCompressor;
class ServerImageBuffer {
public:
@ -32,20 +49,16 @@ public:
bool isEmpty();
bool isFull();
ServerImage& getReadImage();
ServerImage* getActiveImage();
ServerImage* beginInsertImage(NetCamera&);
void endInsertImage();
bool begin_frame(NetCamera&);
int add_frame(RPCReceive& rcv, std::atomic<bool>& stopped);
int add_streamed_image(RPCReceive& rcv, std::atomic<bool>& stopped,
bool save_streamed_image, const std::string &output_folder_path, int frame_id);
bool wait_for_streamed_image(std::atomic<bool>& stopped );
const StreamedImage* get_streamed_image();
void set_output_folder_path(const std::string & output_folder_path);
void reset();
@ -57,6 +70,9 @@ public:
void set_master_image_color_format(ClusterSessionParams::MasterImageColorFormat master_image_color_format_in) {
master_image_color_format = master_image_color_format_in;
}
#if defined(OS_LINUX) && defined(WITH_CUDA)
void deleteNvEncoder();
#endif
//only accessed by insertion thread
@ -72,6 +88,9 @@ public:
tbb::mutex serverImageStopMutex;
private:
static const int CUDA_DEVICE_NUM = 0;
static const string VIDEO_FULL_FILE_NAME;
bool denoiseImage(ServerImage &bufferImage, DenoisingContext & denoising_context);
// In the master-workers configuration (which is the target production setup) images are normalised
// upon arriving at the master. It's done in different threads, which benefits from the
@ -82,6 +101,55 @@ private:
void normalizeImage(ServerImage &server_image);
ClusterSessionParams::MasterImageColorFormat master_image_color_format =
ClusterSessionParams::DEFAULT_MASTER_IMAGE_COLOR_FORMAT;
#ifdef WITH_CUDA
void allocateCudaBuffer(long long unsigned int * buffer_ptr, size_t buffer_size,
const std::string & buffer_name);
void allocateImageCompressionCudaBuffers(const ServerImage &server_image);
void releaseImageCompressionCudaBuffers();
void resetNvEncoder(const ServerImage &server_image);
void copyServerImageToGpuMemory(const ServerImage &server_image, long long unsigned int &linear_image_buffer_gpu);
#endif // WITH_CUDA
void writeImageToVideoStreamFile(const std::vector<uint8_t> &encoded_image) const;
#ifdef WITH_CUDA
#ifdef WITH_OPTIX
void compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image, MasterOptixDenoiser* optix_denoiser);
void compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image, MasterOptixDenoiser* optix_denoiser);
#else
void compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image);
void compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image);
#endif // WITH_OPTIX
#else
void compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image);
#endif // WITH_CUDA
std::string output_folder_path;
std::string encoded_videostream_filename;
bool image_buffers_allocated = false;
// Use long long unsigned int instead of CUdeviceptr since cuda headers are not included
// in this header file: server_image_buffer.h is included from cycles/session/buffers.cpp
// via libcluster/net_server.h, and cycles/session is unaware of cuda, so compilation would fail.
// To minimize changes to cycles/session and keep it cuda independent, cuda headers are
// included in server_image_buffer.cpp instead.
// We may come up with better encapsulation and a cleaner libcluster interface towards the Blender code.
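// (For reference: on 64-bit platforms the CUDA driver API defines CUdeviceptr as a typedef
// of unsigned long long, so the fields below alias it exactly.)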
long long unsigned int nvencoder_input_buffer_gpu = 0;
// When the rendered image is denoised by the OIDN denoiser, or is not denoised at all,
// it resides in CPU memory.
// In those cases we copy the image into this CUDA buffer when nvencoder is used for image compression,
// so we can do gamma correction and rgb-to-yuv conversion before passing the image to the nvencoder.
long long unsigned int linear_image_buffer_gpu = 0;
// image compressors
#if defined(OS_LINUX) && defined(WITH_CUDA)
std::unique_ptr<NvEncoder> nv_encoder_uptr;
std::unique_ptr<NvDecoder> nv_decoder_uptr;
#endif
std::unique_ptr<TurbojpegCompressor> turbojpeg_compressor_uptr;
};
} // cgr_libcluster

View File

@ -1,15 +1,41 @@
#include "image_io_util.h" #include "image_io_util.h"
#include <turbojpeg.h> // for tjFree
#include "./utils/timer.h" // for scoped_timer #include "./utils/timer.h" // for scoped_timer
#include "./utils/logging.h" #include "./utils/logging.h"
#include "streamed_image.h" #include "streamed_image.h"
#include "net_simple.h" #include "net_simple.h"
#include "compression/turbojpeg_compressor.h"
namespace cgr_libcluster { namespace cgr_libcluster {
using OpenImageIO_v2_4::ImageOutput; using OpenImageIO_v2_4::ImageOutput;
using OpenImageIO_v2_4::TypeDesc; using OpenImageIO_v2_4::TypeDesc;
StreamedImage::StreamedImage() : allocated(false), w(0), h(0),
jpeg_compressor_uptr(new TurbojpegCompressor()) {
}
StreamedImage::~StreamedImage(){
}
// Copy/move special members transfer the image state but never share the compressor:
// each instance gets its own TurbojpegCompressor instance.
StreamedImage::StreamedImage(const StreamedImage& other) :
allocated(other.allocated), w(other.w), h(other.h),
byte_buffer(other.byte_buffer), compressed_image(other.compressed_image),
camera(other.camera), jpeg_compressor_uptr(new TurbojpegCompressor()) {
}
StreamedImage::StreamedImage(StreamedImage&& other) :
allocated(other.allocated), w(other.w), h(other.h),
byte_buffer(std::move(other.byte_buffer)), compressed_image(std::move(other.compressed_image)),
camera(other.camera), jpeg_compressor_uptr(new TurbojpegCompressor()) {
}
StreamedImage& StreamedImage::operator=(const StreamedImage& other) {
allocated = other.allocated;
w = other.w;
h = other.h;
byte_buffer = other.byte_buffer;
compressed_image = other.compressed_image;
camera = other.camera;
jpeg_compressor_uptr.reset(new TurbojpegCompressor());
return *this;
}
StreamedImage& StreamedImage::operator=(StreamedImage&& other) {
allocated = other.allocated;
w = other.w;
h = other.h;
byte_buffer = std::move(other.byte_buffer);
compressed_image = std::move(other.compressed_image);
camera = other.camera;
jpeg_compressor_uptr.reset(new TurbojpegCompressor());
return *this;
}
size_t StreamedImage::read(RPCReceive &rcv)
{
size_t buf_size = 0;
@ -18,163 +44,47 @@ size_t StreamedImage::read(RPCReceive &rcv)
NetCamera net_camera;
rcv.read_buffer(&net_camera, sizeof(NetCamera));
// log line below is used by get_metrics.py to calculate stats. If you change it
// please make sure get_metrics.py still works correctly. Update if needed.
LOG(INFO) << "net camera received for frame: " << net_camera.frame << " " << net_camera.cam_width << " " << net_camera.cam_height; LOG(INFO) << "net camera received for frame: " << net_camera.frame << " " << net_camera.cam_width << " " << net_camera.cam_height;
initImage(net_camera); initImage(net_camera);
std::vector<uint8_t> cbuffer(buf_size); if(compressed_image.size() != buf_size) {
rcv.read_buffer(reinterpret_cast<char *>(cbuffer.data()), buf_size); compressed_image.resize(buf_size);
LOG(INFO) << "read image for frame: " << camera.frame << " with sample: " << camera.sampleCount << " size: " << buf_size;
scoped_timer dec_timer;
if (!decompress(cbuffer)) {
buf_size = 0;
} }
LOG(INFO) << "decompress image for frame: " << camera.frame << " " << dec_timer.elapsed(); rcv.read_buffer(reinterpret_cast<char *>(compressed_image.data()), buf_size);
LOG(INFO) << "read image for frame: " << camera.frame << " with sample: " << camera.sampleCount << " size: " << buf_size;
return buf_size; return buf_size;
} }
size_t StreamedImage::write(RPCSend &snd)
{
const size_t compressed_image_size = compressed_image.size();
if (compressed_image_size > 0) {
// get_metrics.py script depends on formatting of this log line, if you change it,
// please make sure that script still works or update it accordingly
LOG(INFO) << "stream image for frame: " << camera.frame << " with sample " << camera.sampleCount << " quality: " <<
camera.compression_quality << " buffer size " << compressed_image_size + sizeof(compressed_image_size) + sizeof(NetCamera);
snd.write();
snd.write_buffer(&compressed_image_size, sizeof(compressed_image_size));
snd.write_buffer(&camera, sizeof (NetCamera) );
snd.write_buffer(compressed_image.data(), compressed_image_size);
}
return compressed_image_size;
}
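// Wire layout produced by write() above (added for clarity):
// [size_t compressed_image_size][NetCamera][compressed_image_size bytes of compressed image]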
void StreamedImage::copyInCompressedImage(const uint8_t * compressed_image_ptr, const size_t size_in_bytes) {
// for uint8_t the number_of_elements is the same as size_in_bytes
const size_t number_of_elements = size_in_bytes;
compressed_image.assign(compressed_image_ptr, compressed_image_ptr + number_of_elements);
LOG(ERROR) << "webrtcPeer is nullptr";
return 0;
}
size_t buf_size = getBufferSize();
if (buf_size > 0) {
// TODO(fangy): Might need to work with pmishchuk@ to update the LOG line below so get_metrics.py works when WebRTC is used.
// get_metrics.py script depends on formatting of this log line, it you change it,
// please make sure that script still works or update it accordingly
LOG(INFO) << "stream image for frame: " << camera.frame << " with sample " << camera.sampleCount << " quality: " << camera.compression_quality << " buffer size " << buf_size + sizeof(buf_size) + sizeof(NetCamera) << " using WebRTC";
int width = 0, height = 0;
void *image_buffer = getImage(width, height);
LOG(INFO) << "WebRTC sendFrame id " << getFrame() << " width " << width << " height " << height;
scoped_timer send_frame_timer;
cgr_streaming::WebrtcFrame frame(getFrame(), static_cast<const uint8_t*>(image_buffer), width, height);
if (!webrtcPeer->sendFrame(frame)) {
LOG(ERROR) << "failed to send frame via WebRTC";
}
LOG(INFO) << "send frame time for frame number: " << camera.frame << " " << send_frame_timer.elapsed();
}
return buf_size;
}
#endif
//#define TIME_JPEG
size_t StreamedImage::compress(unsigned char* &jpeg_image)
{
// Convert buffer to unsigned char * 3 channels
const int subsampling = TJSAMP_444;
size_t jpeg_length = 0; // tjCompress2 will allocate the jpeg_image buffer
jpeg_image = nullptr;
#ifdef TIME_JPEG
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
#endif
tjhandle jpeg_compressor = tjInitCompress();
if (jpeg_compressor == nullptr) {
LOG(ERROR) << "Cannot initialize JPEG compressor";
return 0;
}
void* src_buffer = byte_buffer.data();
int jpeg_error = tjCompress2(jpeg_compressor,
(unsigned char*) src_buffer,
w,
0,
h,
TJPF_RGB,
&jpeg_image,
(unsigned long *)&jpeg_length,
subsampling,
camera.compression_quality,
TJFLAG_FASTDCT);
tjDestroy(jpeg_compressor);
if (jpeg_error < 0) {
const char *jpeg_error_str = tjGetErrorStr();
LOG(ERROR) << "JPEG compression error: " << jpeg_error_str;
return 0;
}
#ifdef TIME_JPEG
clock_gettime(CLOCK_MONOTONIC, &end_time);
// ms time
double elapsed_time = (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
LOG(INFO) << "TIMING: JPEG compression: " << elapsed_time << "ms"
<< ", resolution " << w << "x" << h
<< ", sizes " << src_buffer.size() << " (" << jpeg_length << ")";
#endif
return jpeg_length;
} }
std::vector<uint8_t>& StreamedImage::getCompressedImage() {
return compressed_image;
}
std::vector<cgr_libcluster::uchar3>& StreamedImage::getByteBuffer() {
return byte_buffer;
}
bool StreamedImage::saveImage(const std::string &output_folder_path, const std::string &file_name_prefix, int frame_id) {
@ -183,4 +93,12 @@ bool StreamedImage::saveImage(const std::string &output_folder_path, const std::
return ImageIOUtil::saveFrame(file_path, TypeDesc::UCHAR, image_output.get(), getByteBuffer().data(), w, h);
}
std::vector<cgr_libcluster::uchar3>& StreamedImage::getRgbImageBuffer() {
return byte_buffer;
}
NetCamera& StreamedImage::getNetCamera() {
return camera;
}
} // cgr_libcluster

View File

@ -10,29 +10,26 @@ namespace cgr_libcluster {
class PathTraceDisplay;
class RPCSend;
class RPCReceive;
class TurbojpegCompressor;
class StreamedImage {
public:
StreamedImage();
~StreamedImage();
StreamedImage(const StreamedImage& );
StreamedImage(StreamedImage&& );
StreamedImage& operator=(const StreamedImage& );
StreamedImage& operator=(StreamedImage&& );
public:
const void* getImage(int &width, int &height) const
{
width = w;
height = h;
return byte_buffer.data();
}
void initImage(NetCamera &cam) {
camera = cam;
if (allocated) {
@ -48,26 +45,28 @@ class StreamedImage {
size_t getBufferSize() const { return w * h * sizeof(cgr_libcluster::uchar3); }
size_t read(RPCReceive &rcv);
size_t write(RPCSend &snd);
int getFrame() const { return camera.frame; }
bool saveImage(const std::string &output_folder_path, const std::string &file_name_prefix, int frame_id);
void copyInCompressedImage(const uint8_t * compressed_image_ptr, const size_t size_in_bytes);
std::vector<cgr_libcluster::uchar3>& getByteBuffer();
std::vector<uint8_t>& getCompressedImage();
std::vector<cgr_libcluster::uchar3>& getRgbImageBuffer();
NetCamera& getNetCamera();
private:
bool allocated;
int w;
int h;
std::vector<cgr_libcluster::uchar3> byte_buffer; // raw image as rgb
std::vector<uint8_t> compressed_image;
NetCamera camera;
// Hold TurbojpegCompressor behind a pointer so we can use a forward declaration,
// avoid including the TurboJpeg header here, and minimize changes in the Blender code
// which includes this header but is not aware of TurboJpeg.
std::unique_ptr<TurbojpegCompressor> jpeg_compressor_uptr;
};
} // cgr_libcluster

View File

@ -34,11 +34,13 @@ set(LIBRARIES
add_definitions(-DWITH_CYCLES_LOGGING)
if(WITH_CYCLES_DEVICE_CUDA)
if(WITH_CUDA_DYNLOAD)
list(APPEND LIBRARIES extern_cuew)
add_definitions(-DWITH_CUDA_DYNLOAD)
else()
list(APPEND LIBRARIES ${CUDA_CUDA_LIBRARY})
endif()
endif()
if(WITH_CYCLES_LOGGING)

View File

@ -1,7 +1,9 @@
#include "testing/testing.h" #include "testing/testing.h"
#include "gmock/gmock.h" #include "gmock/gmock.h"
#ifdef WITH_OPTIX
#include <optix_stubs.h> #include <optix_stubs.h>
#include <optix_function_table_definition.h> #include <optix_function_table_definition.h>
#endif
#include "mocks.h" #include "mocks.h"
#include "denoising/denoising_context.h" #include "denoising/denoising_context.h"

View File

@ -0,0 +1,46 @@
#ifndef __CUDA_UTILS_H__
#define __CUDA_UTILS_H__
#include <stdexcept> // for std::runtime_error thrown by the macros below
#include <string> // for std::string/std::to_string used in the messages
// NOTE: callers are expected to include the CUDA/OptiX headers and the project
// logging header (for LOG) before using these macros.
namespace cgr_libcluster {
#define THROW_IF_ERROR true
#define DO_NOT_THROW false
#define CUDA_API_CALL(cuda_api, throw_if_error) \
do { \
CUresult cu_result = cuda_api; \
if (cu_result != CUDA_SUCCESS) { \
const char *error_name = NULL; \
cuGetErrorName(cu_result, &error_name); \
std::string message = std::string("ERROR. ") + #cuda_api + " failed with error: " + std::string(error_name); \
LOG(ERROR) << message ; \
if (throw_if_error) { \
throw std::runtime_error(message); \
} \
} \
} while (0)
#define OPTIX_API_CALL(optix_api, throw_if_error) \
do { \
OptixResult optix_result = optix_api; \
if(optix_result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { \
std::string message = std::string("ERROR. ") + #optix_api + " failed due to installed driver " \
"does not support ABI version: " + std::to_string(OPTIX_ABI_VERSION); \
LOG(ERROR) << message ; \
if(throw_if_error) { \
throw std::runtime_error(message); \
} \
} else if(optix_result != OPTIX_SUCCESS) { \
std::string message = std::string("ERROR. ") + #optix_api + " failed with error: " + \
std::to_string(optix_result); \
LOG(ERROR) << message ; \
if(throw_if_error) { \
throw std::runtime_error(message); \
} \
} \
} while (0)
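// Example usage (illustrative only; cuCtxSynchronize and optixInit are standard
// CUDA driver / OptiX entry points, shown here just to demonstrate the macros):
// CUDA_API_CALL(cuCtxSynchronize(), THROW_IF_ERROR); // logs and throws on failure
// CUDA_API_CALL(cuCtxSynchronize(), DO_NOT_THROW); // logs the error and continues
// OPTIX_API_CALL(optixInit(), THROW_IF_ERROR);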
} // end of namespace cgr_libcluster
#endif

View File

@ -0,0 +1,41 @@
#include <algorithm> // for std::max/std::min used by clip()
#include <cstdint> // for uint8_t
#include <vector>
#include "vector_types.h" // for uchar3
#include "image.h"
namespace cgr_libcluster {
#define PIXEL_THREAD_COUNT 60
int clip(int n, int lower, int upper) {
return std::max(lower, std::min(n, upper));
}
void yuv2Rgb(uint8_t *yuv_image, int width, int height,
std::vector<cgr_libcluster::uchar3> * rgb_image) {
const int num_pixels = width * height;
for(int i = 0; i < num_pixels; ++i) {
const int pixel_row = i/width;
const int pixel_column = i - (width * pixel_row);
const int uv_row = pixel_row / 2;
const int uv_column = pixel_column / 2;
const int u_i = num_pixels + uv_row * width + uv_column*2;
const int v_i = u_i + 1;
const int y = yuv_image[i];
const int u = yuv_image[u_i];
const int v = yuv_image[v_i];
const int c = y - 16;
const int d = u - 128;
const int e = v - 128;
const uint8_t r = clip((298*c + 409*e + 128) >> 8, 0, 255);
const uint8_t g = clip((298*c - 100*d - 208*e + 128) >> 8, 0, 255);
const uint8_t b = clip((298*c + 516*d + 128) >> 8, 0, 255);
uchar3 & rgb_pixel = (*rgb_image)[i];
rgb_pixel.x = r;
rgb_pixel.y = g;
rgb_pixel.z = b;
}
}
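// Worked example of the BT.601 math above (added for clarity): for (y, u, v) = (235, 128, 128)
// we get c = 219 and d = e = 0, so r = g = b = (298*219 + 128) >> 8 = 255 (reference white);
// (y, u, v) = (16, 128, 128) gives c = 0 and r = g = b = 0 (reference black).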
}

View File

@ -0,0 +1,12 @@
#ifndef __IMAGE_H__
#define __IMAGE_H__
#include <cstdint>
#include <vector>
#include "vector_types.h" // for uchar3
namespace cgr_libcluster {
// This function expects that rgb_image is not null and is properly sized by the caller to accommodate all pixels
void yuv2Rgb(uint8_t *yuv_image, int width, int height,
std::vector<cgr_libcluster::uchar3> * rgb_image);
}
#endif

View File

@ -0,0 +1,8 @@
file(GLOB SRC *.cpp *.cu)
if(NOT WITH_CYCLES_CUDA_BINARIES)
find_package(CUDA)
endif()
cuda_add_library(cycles_libcluster_cuda_kernels "${LIB}" ${SRC} ${SRC_HEADERS})

View File

@ -0,0 +1,91 @@
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include "gpu_image_utils.h"
constexpr int NUM_COLOR_CHANNELS = 3;
constexpr int NUM_THREADS = 10240;
constexpr int NUM_THREADS_PER_BLOCK = 512;
template<typename T>
struct PixelRgb {
T r;
T g;
T b;
};
__device__
float color_linear_to_srgb(float c) {
if (c < 0.0031308f)
return (c < 0.0f) ? 0.0f : c * 12.92f;
else
return 1.055f * powf(c, 1.0f / 2.4f) - 0.055f;
}
// Simple RGB to YUV NV12 (4:2:0 12 bpp) conversion based on the description in this doc:
// https://learn.microsoft.com/en-us/windows/win32/medfound/recommended-8-bit-yuv-formats-for-video-rendering
// It only supports resolutions with even dimensions to keep it simple. Supporting odd dimensions
// adds extra complexity for edge cases, which is unnecessary complication for use cases we do not need,
// since we only have to support standard resolutions like
// 720p, 1080p, 1024x1024, 2048x2048 etc., which all have even dimensions.
// It looks like RGB to YUV conversion is available in the NVIDIA Performance Primitives (npp)
// https://developer.nvidia.com/npp
// but we currently do not use npp so I won't introduce this dependency just for a single method.
// If there is a standard or better GPU implementation of RGB-to-YUV conversion available we can switch to it.
__global__
void gpuRawRgbToGammaCorrectedYuvNv12Impl(CUdeviceptr cu_image_in_ptr, int width, int height,
bool useSrgbColorSpace, CUdeviceptr cu_image_yuv_nv12_out_ptr) {
const int num_pixels = width * height;
const int num_threads_per_block = blockDim.x;
const int num_blocks_in_grid = gridDim.x;
const int total_num_of_threads = num_threads_per_block * num_blocks_in_grid;
const int num_pixels_to_process_in_thread = (num_pixels + total_num_of_threads) / total_num_of_threads;
const int start_i = blockIdx.x * num_threads_per_block * num_pixels_to_process_in_thread +
threadIdx.x * num_pixels_to_process_in_thread;
const int end_i = min(start_i + num_pixels_to_process_in_thread, num_pixels);
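// (Comment added for clarity) Each thread processes a contiguous chunk of
// (num_pixels + total_num_of_threads) / total_num_of_threads pixels, i.e. at least the
// even split rounded up; end_i clamps the final chunks to the image size.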
const PixelRgb<float> * cu_image_in_rgb_float_ptr = (PixelRgb<float>*)cu_image_in_ptr;
uint8_t* cu_image_yuv_nv12_out_uint_ptr = (uint8_t*)cu_image_yuv_nv12_out_ptr;
float r_f, g_f, b_f;
uint8_t r, g, b;
for(int i = start_i; i < end_i; ++i) {
// Gamma correction
const PixelRgb<float> & raw_pixel = cu_image_in_rgb_float_ptr[i];
if(useSrgbColorSpace) {
r_f = color_linear_to_srgb(raw_pixel.r);
g_f = color_linear_to_srgb(raw_pixel.g);
b_f = color_linear_to_srgb(raw_pixel.b);
} else {
r_f = raw_pixel.r;
g_f = raw_pixel.g;
b_f = raw_pixel.b;
}
r = (uint8_t)(__saturatef(r_f) * 255.0f + 0.5f);
g = (uint8_t)(__saturatef(g_f) * 255.0f + 0.5f);
b = (uint8_t)(__saturatef(b_f) * 255.0f + 0.5f);
// Convert sRGB to YUV
const int pixel_row = i/width;
const int num_pixels_above_the_current_row = pixel_row * width;
const int pixel_column = i - num_pixels_above_the_current_row;
const uint8_t y = (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
cu_image_yuv_nv12_out_uint_ptr[i] = y;
if(pixel_column % 2 == 0 && pixel_row % 2 == 0) {
const uint8_t u = ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
const uint8_t v = ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
const int u_i = num_pixels + i - (pixel_row/2 * width);
cu_image_yuv_nv12_out_uint_ptr[u_i] = u;
cu_image_yuv_nv12_out_uint_ptr[u_i + 1] = v;
}
}
}
void gpuRawRgbToGammaCorrectedYuvNv12(CUdeviceptr cu_image_in_ptr, int width, int height,
bool useSrgbColorSpace, CUstream cuda_stream, CUdeviceptr cu_image_yuv_nv12_out_ptr) {
const int num_blocks = NUM_THREADS / NUM_THREADS_PER_BLOCK;
const size_t size_of_dynamically_allocated_shared_memory = 0;
gpuRawRgbToGammaCorrectedYuvNv12Impl<<<num_blocks, NUM_THREADS_PER_BLOCK,
size_of_dynamically_allocated_shared_memory, cuda_stream>>>(
cu_image_in_ptr, width, height, useSrgbColorSpace, cu_image_yuv_nv12_out_ptr);
}
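// A minimal host-side sketch (assumes a current CUDA context; sizes follow the float RGB
// input and NV12 output formats used above; illustrative only):
// CUdeviceptr rgb_in = 0, yuv_out = 0;
// cuMemAlloc(&rgb_in, width * height * 3 * sizeof(float));
// cuMemAlloc(&yuv_out, width * height * 3 / 2);
// /* ...upload linear RGB pixels into rgb_in... */
// gpuRawRgbToGammaCorrectedYuvNv12(rgb_in, width, height, /*useSrgbColorSpace=*/true,
// /*cuda_stream=*/0, yuv_out);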

View File

@ -0,0 +1,7 @@
#ifndef __IMAGE_UTILS_H__
#define __IMAGE_UTILS_H__
#include <cuda.h> // for CUdeviceptr and CUstream
void gpuRawRgbToGammaCorrectedYuvNv12(CUdeviceptr cu_image_in_ptr, int width, int height,
bool useSrgbColorSpace, CUstream cuda_stream, CUdeviceptr cu_image_yuv_nv12_out_ptr);
#endif

View File

@ -979,8 +979,8 @@ def get_resolution_stats(config):
print("Can not get resolution from the log line, it could be corrupted. Log line:\n" + log_line, file = sys.stderr) print("Can not get resolution from the log line, it could be corrupted. Log line:\n" + log_line, file = sys.stderr)
log_line = log_file.readline() log_line = log_file.readline()
continue continue
width = log_items[-3] width = log_items[-2]
height = log_items[-2] height = log_items[-1]
resolution = width + "x" + height resolution = width + "x" + height
if current_resolution_stats is None or current_resolution_stats.name != resolution: if current_resolution_stats is None or current_resolution_stats.name != resolution:
current_resolution_stats = MetricStats(resolution) current_resolution_stats = MetricStats(resolution)

View File

@ -103,12 +103,16 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
}
/* Create context. */
// FRL_CGR BEGIN
// result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
if (result != CUDA_SUCCESS) {
set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
return;
}
// FRL_CGR END
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@ -116,14 +120,20 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
cuDevArchitecture = major * 100 + minor * 10;
// FRL_CGR BEGIN
// with cuDevicePrimaryCtxRetain do not need to pop current
// cuCtxPopCurrent(NULL);
// FRL_CGR_END
// FRL_CGR_END
} }
CUDADevice::~CUDADevice()
{
texture_info.free();
// FRL_CGR BEGIN
// do not destroy primary context
//cuda_assert(cuCtxDestroy(cuContext));
// FRL_CGR END
}
bool CUDADevice::support_device(const uint /*kernel_features*/)

View File

@ -6,6 +6,7 @@
CCL_NAMESPACE_BEGIN
using namespace std::chrono;
using cgr_libcluster::ClusterSessionParams;
ccl::float3 &toCyclesFloat3(cgr_libcluster::float3 &value);
cgr_libcluster::float3 &fromCyclesFloat3(ccl::float3 &value);
@ -82,6 +83,7 @@ void Session::logRenderSessionPanelSettings() {
" num servers: " << params.cluster_session_params.num_servers << std::endl << " num servers: " << params.cluster_session_params.num_servers << std::endl <<
" compression quality: " << params.cluster_session_params.master_compression_quality << std::endl << " compression quality: " << params.cluster_session_params.master_compression_quality << std::endl <<
" master_image_color_format: " << params.cluster_session_params.master_image_color_format << std::endl << " master_image_color_format: " << params.cluster_session_params.master_image_color_format << std::endl <<
" master_image_compressor: " << params.cluster_session_params.master_image_compressor << std::endl <<
" master denoiser: " << params.cluster_session_params.master_denoiser; " master denoiser: " << params.cluster_session_params.master_denoiser;
} }
@ -305,7 +307,8 @@ bool Session::server_wait_for_camera()
//-------Methods used by client and standalone session to create NetCamera objects, serialize and transport (client) them
//-------------------------------------------------------------------------------------------------------------------------
//used by client session to initialize a NetCamera for transport
static void initializeNetCamera(cgr_libcluster::NetCamera& netCamera, Camera &cam, int s, int f, int sceneFrame, int iseed,
cgr_libcluster::ClusterSessionParams& csp)
{
netCamera.cam_matrix = fromCyclesTransform(cam.get_matrix());
netCamera.cam_type = fromCyclesCameraType(cam.get_camera_type());
@ -329,11 +332,29 @@ static void initializeNetCamera(cgr_libcluster::NetCamera& netCamera, Camera &ca
netCamera.master_denoiser = csp.master_denoiser;
netCamera.master_image_color_format = csp.master_image_color_format;
netCamera.master_image_compressor = csp.master_image_compressor;
VLOG(3) << "Constructed net camera: " << netCamera.cam_width << " " << netCamera.cam_height; VLOG(3) << "Constructed net camera: " << netCamera.cam_width << " " << netCamera.cam_height;
} }
void Session::client_send_camera(int samples)
{
#ifdef WITH_CUDA
if (theClient &&
params.cluster_session_params.master_image_compressor == ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER &&
(scene->camera->get_full_width() % 2 || scene->camera->get_full_height() % 2)) {
std::string message = "NVENCODER compressor is requested for images with odd dimension: " +
std::to_string(scene->camera->get_full_width()) + "x" + std::to_string(scene->camera->get_full_height()) +
" Current implementation of NVENCODER only supports images with even dimensions.\
To use NVENCODER compressor please resize image so it has even dimension like 1280x720";
throw std::runtime_error(message);
}
#else
if(params.cluster_session_params.master_image_compressor == ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
throw std::runtime_error("ERROR. NVENCODER compressor is requested. Client is compiled without CUDA support\
so has no nvencoder and will not be able to decode received images which are nvencoded.\
Recompile with CUDA or use JPEG compressor instead. Terminating.");
}
#endif
//TODO Modify object msg needs to be handled properly
if (theClient) {
cgr_libcluster::ModifyObjectParams & modify_object_params = params.cluster_session_params.modify_object_params;
@ -349,7 +370,6 @@ void Session::client_send_camera(int samples)
int scene_frame = scene->getCurrentFrame();
initializeNetCamera(netCamera, *(scene->camera), samples, frame_count, scene_frame, iseed, params.cluster_session_params);
if (theClient) {
//client send NetCamera object to master
theClient->send_camera(netCamera);

View File

@ -5,4 +5,16 @@ if(WITH_FREESTYLE)
add_definitions(-DWITH_FREESTYLE)
endif()
#FRL_CLR_BEGIN
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
add_subdirectory(intern)

View File

@ -146,6 +146,7 @@ enum {
CAM_SHOWSENSOR = (1 << 8),
CAM_SHOW_SAFE_CENTER = (1 << 9),
CAM_SHOW_BG_IMAGE = (1 << 10),
CAM_SHOW_KINECT_AZURE = (1 << 11), // Custom FB
};
/* Sensor fit */ /* Sensor fit */

View File

@ -1657,6 +1657,7 @@ typedef struct NodeShaderMix {
#define SHD_GLOSSY_GGX 2
#define SHD_GLOSSY_ASHIKHMIN_SHIRLEY 3
#define SHD_GLOSSY_MULTI_GGX 4
#define SHD_GLOSSY_GGX_FRESNEL_REFRACTION 5
/* vector transform */
#define SHD_VECT_TRANSFORM_TYPE_VECTOR 0

View File

@ -850,7 +850,8 @@ typedef struct RenderData {
int schedule_modify_object_message;
int device_scale_factor;
int master_image_color_format;
int master_image_compressor;
//char _pad_frl_fields[4]; // add padding if needed to satisfy the alignment check (by 8 bytes)
/* WebRTC related */
int use_webrtc;
@ -898,7 +899,9 @@ typedef enum {
typedef enum {
MASTER_DENOISER_NONE = 0,
MASTER_DENOISER_OIDN = 1,
#ifdef WITH_OPTIX
MASTER_DENOISER_OPTIX = 2,
#endif //WITH_OPTIX
MASTER_DENOISER_BARCELONA = 3,
} eMasterDenoiser;
@ -908,6 +911,14 @@ typedef enum {
MASTER_IMAGE_COLOR_FORMAT_SRGB = 2,
} eMasterImageColorFormat;
/* RenderData.master_image_compressor */
typedef enum {
MASTER_IMAGE_COMPRESSOR_JPEG = 1,
#ifdef WITH_CUDA
MASTER_IMAGE_COMPRESSOR_NVENCODER = 2,
#endif //WITH_CUDA
} eMasterImageCompressor;
/* RenderData.peer_connection_protocol */
typedef enum {
PEER_CONNECTION_PROTOCOL_ANY = 0,

View File

@ -20,6 +20,17 @@ set(LIB
)
add_definitions(-DWITH_DNA_GHASH)
#FRL_CLR_BEGIN
#if(WITH_CYCLES_DEVICE_CUDA)
# add_definitions(-DWITH_CUDA)
#endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
# Needed for `mallocn.c`.
if(HAVE_MALLOC_STATS_H)

View File

@ -241,6 +241,18 @@ if(WITH_PYTHON)
)
endif()
#FRL_CLR_BEGIN
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
if(WITH_IMAGE_OPENEXR)
add_definitions(-DWITH_OPENEXR)
endif()

View File

@ -6280,7 +6280,7 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
0,
"OpenImageDenoise",
"Master denoises images with Intel Open Image Denoise"},
{MASTER_DENOISER_OPTIX, {MASTER_DENOISER_OPTIX,
"OPTIX", "OPTIX",
0, 0,
"OptiX", "OptiX",
@ -6307,6 +6307,22 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
{0, NULL, 0, NULL, NULL},
};
static const EnumPropertyItem master_image_compressor_items[] = {
{MASTER_IMAGE_COMPRESSOR_JPEG,
"JPEG",
0,
"JPEG",
"Master compresses images with JPEG"},
#ifdef WITH_CUDA
{MASTER_IMAGE_COMPRESSOR_NVENCODER,
"NVENCODER",
0,
"NVENCODER",
"Master compresses images with NVENCODER."},
#endif //WITH_CUDA
{0, NULL, 0, NULL, NULL},
};
static const EnumPropertyItem render_session_mode_items[] = {
{RENDER_SESSION_MODE_STANDALONE,
"STANDALONE",
@ -6452,6 +6468,12 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
RNA_def_property_ui_text(prop, "Color format", ""); RNA_def_property_ui_text(prop, "Color format", "");
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL); RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
prop = RNA_def_property(srna, "master_image_compressor", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_items(prop, master_image_compressor_items);
RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
RNA_def_property_ui_text(prop, "Image compressor", "");
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
// Modify object name section
prop = RNA_def_property(srna, "modify_object_name", PROP_STRING, PROP_NONE);
RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);

View File

@ -112,6 +112,18 @@ if(WITH_OPENCOLORIO)
add_definitions(-DWITH_OCIO)
endif()
#FRL_CLR_BEGIN
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
# Setup the EXE sources and `buildinfo`.
set(SRC
creator.c

View File

@ -2021,9 +2021,13 @@ static int arg_handle_master_denoiser(int argc, const char **argv, void *data)
const char *rmtype = argv[1];
if (BLI_strcasecmp(rmtype, "OIDN") == 0) {
scene->r.master_denoiser = MASTER_DENOISER_OIDN;
}
#ifdef WITH_OPTIX
else if (BLI_strcasecmp(rmtype, "OPTIX") == 0) {
scene->r.master_denoiser = MASTER_DENOISER_OPTIX;
}
#endif
else if (BLI_strcasecmp(rmtype, "BARCELONA") == 0) {
scene->r.master_denoiser = MASTER_DENOISER_BARCELONA;
} else {
printf("\nError: Unknown master denoiser %s (--master-denoiser <OIDN> or <OPTIX> or <BARCELONA>).\n", rmtype);
@ -2065,11 +2069,48 @@ static int arg_handle_master_image_color_format(int argc, const char **argv, voi
else {
printf(
"\nError: no blend loaded. "
"order the arguments so '--master-image-color-format' is after the blend is loaded.\n");
return 0;
}
}
static const char arg_handle_master_image_compressor_doc[] =
"<master_image_compression>\n"
"\tCompressor that master uses to compress images before sending them to a client.\n"
"\tSupported values: JPEG, NVENCODER\n";
static int arg_handle_master_image_compressor(int argc, const char **argv, void *data)
{
bContext *C = data;
Scene *scene = CTX_data_scene(C);
if (scene) {
scene->r.master_image_compressor = MASTER_IMAGE_COMPRESSOR_JPEG;
if (argc > 1) {
const char *rmtype = argv[1];
if (BLI_strcasecmp(rmtype, "JPEG") == 0) {
scene->r.master_image_compressor = MASTER_IMAGE_COMPRESSOR_JPEG;
}
#ifdef WITH_CUDA
else if (BLI_strcasecmp(rmtype, "NVENCODER") == 0) {
scene->r.master_image_compressor = MASTER_IMAGE_COMPRESSOR_NVENCODER;
}
#endif //WITH_CUDA
else {
printf("\nError: Unknown compressor %s (--master-image-compressor <JPEG> or <NVENCODER>).\n", rmtype);
}
}
return 1;
}
else {
printf(
"\nError: no blend loaded. "
"order the arguments so '--master-image-compressor' is after the blend is loaded.\n");
return 0;
}
}
#ifdef WITH_WEBRTC
static int arg_handle_webrtc_int_param(int argc, const char **argv, const char *arg_id, int *param, int default_param_value)
@ -3050,6 +3091,7 @@ void main_args_setup(bContext *C, bArgs *ba)
BLI_args_add(ba, NULL, "--save_denoise_io", CB(arg_handle_save_denoise_io), C); BLI_args_add(ba, NULL, "--save_denoise_io", CB(arg_handle_save_denoise_io), C);
BLI_args_add(ba, NULL, "--save_cameras", CB(arg_handle_save_cameras), C); BLI_args_add(ba, NULL, "--save_cameras", CB(arg_handle_save_cameras), C);
BLI_args_add(ba, NULL, "--master-image-color-format", CB(arg_handle_master_image_color_format), C); BLI_args_add(ba, NULL, "--master-image-color-format", CB(arg_handle_master_image_color_format), C);
BLI_args_add(ba, NULL, "--master-image-compressor", CB(arg_handle_master_image_compressor), C);
#ifdef WITH_WEBRTC
BLI_args_add(ba, NULL, "--use-webrtc", CB(arg_handle_use_webrtc), C);
BLI_args_add(ba, NULL, "--signaling-server-address", CB(arg_handle_signaling_server_address), C);