Merge latest distributed rendering changes from GHE master-cluster into master-cluster on blender.org #1

Merged
William Leeson merged 13 commits from master-cluster_sync_from_ghe_patch into master-cluster 2023-03-31 17:26:02 +02:00
77 changed files with 7843 additions and 734 deletions


@@ -17,7 +17,11 @@ endif()
add_subdirectory(rangetree)
add_subdirectory(wcwidth)
#FRL_CLR_BEGIN
if(UNIX AND NOT APPLE)
add_subdirectory(perceptualdiff)
endif()
#FRL_CLR_END
if(WITH_BULLET)
if(NOT WITH_SYSTEM_BULLET)

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,436 @@
/*
* This copyright notice applies to this header file only:
*
* Copyright (c) 2010-2021 NVIDIA Corporation
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the software, and to permit persons to whom the
* software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/********************************************************************************************************************/
//! \file nvcuvid.h
//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
//! \date 2015-2020
//! This file contains the interface constants, structure definitions and function prototypes.
/********************************************************************************************************************/
#if !defined(__NVCUVID_H__)
#define __NVCUVID_H__
#include "cuviddec.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/***********************************************/
//!
//! High-level helper APIs for video sources
//!
/***********************************************/
typedef void *CUvideosource;
typedef void *CUvideoparser;
typedef long long CUvideotimestamp;
/************************************************************************/
//! \enum cudaVideoState
//! Video source state enums
//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
/************************************************************************/
typedef enum {
cudaVideoState_Error = -1, /**< Error state (invalid source) */
cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream) */
cudaVideoState_Started = 1 /**< Source is running and delivering data */
} cudaVideoState;
/************************************************************************/
//! \enum cudaAudioCodec
//! Audio compression enums
//! Used in CUAUDIOFORMAT structure
/************************************************************************/
typedef enum {
cudaAudioCodec_MPEG1=0, /**< MPEG-1 Audio */
cudaAudioCodec_MPEG2, /**< MPEG-2 Audio */
cudaAudioCodec_MP3, /**< MPEG-1 Layer III Audio */
cudaAudioCodec_AC3, /**< Dolby Digital (AC3) Audio */
cudaAudioCodec_LPCM, /**< PCM Audio */
cudaAudioCodec_AAC, /**< AAC Audio */
} cudaAudioCodec;
/************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMAT
//! Video format
//! Used in cuvidGetSourceVideoFormat API
/************************************************************************************************/
typedef struct
{
cudaVideoCodec codec; /**< OUT: Compression format */
/**
* OUT: frame rate = numerator / denominator (for example: 30000/1001)
*/
struct {
/**< OUT: frame rate numerator (0 = unspecified or variable frame rate) */
unsigned int numerator;
/**< OUT: frame rate denominator (0 = unspecified or variable frame rate) */
unsigned int denominator;
} frame_rate;
unsigned char progressive_sequence; /**< OUT: 0=interlaced, 1=progressive */
unsigned char bit_depth_luma_minus8; /**< OUT: high bit depth luma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
unsigned char min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be allocated for correct
decoding. The client can send this value in ulNumDecodeSurfaces
(in CUVIDDECODECREATEINFO structure).
This guarantees correct functionality and optimal video memory
usage but not necessarily the best performance, which depends on
the design of the overall application. The optimal number of
decode surfaces (in terms of performance and memory utilization)
should be decided by experimentation for each application, but it
cannot go below min_num_decode_surfaces.
If this value is used for ulNumDecodeSurfaces then it must be
returned to parser during sequence callback. */
unsigned int coded_width; /**< OUT: coded frame width in pixels */
unsigned int coded_height; /**< OUT: coded frame height in pixels */
/**
* area of the frame that should be displayed
* typical example:
* coded_width = 1920, coded_height = 1088
* display_area = { 0,0,1920,1080 }
*/
struct {
int left; /**< OUT: left position of display rect */
int top; /**< OUT: top position of display rect */
int right; /**< OUT: right position of display rect */
int bottom; /**< OUT: bottom position of display rect */
} display_area;
cudaVideoChromaFormat chroma_format; /**< OUT: Chroma format */
unsigned int bitrate; /**< OUT: video bitrate (bps, 0=unknown) */
/**
* OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
*/
struct {
int x;
int y;
} display_aspect_ratio;
/**
* Video Signal Description
* Refer section E.2.1 (VUI parameters semantics) of H264 spec file
*/
struct {
unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified */
unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */
unsigned char reserved_zero_bits : 4; /**< Reserved bits */
unsigned char color_primaries; /**< OUT: chromaticity coordinates of source primaries */
unsigned char transfer_characteristics; /**< OUT: opto-electronic transfer characteristic of the source picture */
unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB primaries */
} video_signal_description;
unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX) */
} CUVIDEOFORMAT;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDOPERATINGPOINTINFO
//! Operating point information of scalable bitstream
/****************************************************************/
typedef struct
{
cudaVideoCodec codec;
union
{
struct
{
unsigned char operating_points_cnt;
unsigned char reserved24_bits[3];
unsigned short operating_points_idc[32];
} av1;
unsigned char CodecReserved[1024];
};
} CUVIDOPERATINGPOINTINFO;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDAV1SEQHDR
//! AV1 specific sequence header information
/****************************************************************/
typedef struct {
unsigned int max_width;
unsigned int max_height;
unsigned char reserved[1016];
} CUVIDAV1SEQHDR;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMATEX
//! Video format including raw sequence header information
//! Used in cuvidGetSourceVideoFormat API
/****************************************************************/
typedef struct
{
CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */
union {
CUVIDAV1SEQHDR av1;
unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data */
};
} CUVIDEOFORMATEX;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUAUDIOFORMAT
//! Audio formats
//! Used in cuvidGetSourceAudioFormat API
/****************************************************************/
typedef struct
{
cudaAudioCodec codec; /**< OUT: Compression format */
unsigned int channels; /**< OUT: number of audio channels */
unsigned int samplespersec; /**< OUT: sampling frequency */
unsigned int bitrate; /**< OUT: For uncompressed, can also be used to determine bits per sample */
unsigned int reserved1; /**< Reserved for future use */
unsigned int reserved2; /**< Reserved for future use */
} CUAUDIOFORMAT;
/***************************************************************/
//! \enum CUvideopacketflags
//! Data packet flags
//! Used in CUVIDSOURCEDATAPACKET structure
/***************************************************************/
typedef enum {
CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream */
CUVID_PKT_TIMESTAMP = 0x02, /**< Timestamp is valid */
CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */
CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame or one field */
CUVID_PKT_NOTIFY_EOS = 0x10, /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an additional (dummy)
display callback will be invoked with null value of CUVIDPARSERDISPINFO which
should be interpreted as end of the stream. */
} CUvideopacketflags;
/*****************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEDATAPACKET
//! Data Packet
//! Used in cuvidParseVideoData API
//! IN for cuvidParseVideoData
/*****************************************************************************/
typedef struct _CUVIDSOURCEDATAPACKET
{
unsigned long flags; /**< IN: Combination of CUVID_PKT_XXX flags */
unsigned long payload_size; /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */
const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */
CUvideotimestamp timestamp; /**< IN: Presentation time stamp (10MHz clock), only valid if
CUVID_PKT_TIMESTAMP flag is set */
} CUVIDSOURCEDATAPACKET;
// Callback for packet delivery
typedef int (CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
/**************************************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEPARAMS
//! Describes parameters needed in cuvidCreateVideoSource API
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/**************************************************************************************************************************/
typedef struct _CUVIDSOURCEPARAMS
{
unsigned int ulClockRate; /**< IN: Time stamp units in Hz (0=default=10000000Hz) */
unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
unsigned int uReserved1[6]; /**< Reserved for future use - set to zero */
void *pUserData; /**< IN: User private data passed in to the data handlers */
PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets */
PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets. */
void *pvReserved2[8]; /**< Reserved for future use - set to NULL */
} CUVIDSOURCEPARAMS;
/**********************************************/
//! \ingroup ENUMS
//! \enum CUvideosourceformat_flags
//! CUvideosourceformat_flags
//! Used in cuvidGetSourceVideoFormat API
/**********************************************/
typedef enum {
CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */
} CUvideosourceformat_flags;
#if !defined(__APPLE__)
/***************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create CUvideosource object. CUvideosource spawns demultiplexer thread that provides two callbacks:
//! pfnVideoDataHandler() and pfnAudioDataHandler()
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/***************************************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
/***************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create video source
/***************************************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
/********************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
//! Destroy video source
/********************************************************************/
CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
/******************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state)
//! Set video source state to:
//! cudaVideoState_Started - to signal the source to run and deliver data
//! cudaVideoState_Stopped - to stop the source from delivering the data
//! cudaVideoState_Error - invalid source
/******************************************************************************************/
CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
/******************************************************************************************/
//! \ingroup FUNCTS
//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
//! Get video source state
//! Returns:
//! cudaVideoState_Started - if Source is running and delivering data
//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream
//! cudaVideoState_Error - if Source is in error state
/******************************************************************************************/
cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj);
/******************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags)
//! Gets video source format in pvidfmt, flags is set to combination of CUvideosourceformat_flags as per requirement
/******************************************************************************************************************/
CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
/**************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags)
//! Get audio source format
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/**************************************************************************************************************************/
CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
#endif
/**********************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERDISPINFO
//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
/**********************************************************************************/
typedef struct _CUVIDPARSERDISPINFO
{
int picture_index; /**< OUT: Index of the current picture */
int progressive_frame; /**< OUT: 1 if progressive frame; 0 otherwise */
int top_field_first; /**< OUT: 1 if top field is displayed first; 0 otherwise */
int repeat_first_field; /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling,
-1=unpaired field) */
CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */
} CUVIDPARSERDISPINFO;
/***********************************************************************************************************************/
//! Parser callbacks
//! The parser will call these synchronously from within cuvidParseVideoData(), whenever there is sequence change or a picture
//! is ready to be decoded and/or displayed. First argument in functions is "void *pUserData" member of structure CUVIDSOURCEPARAMS
//! Return values from these callbacks are interpreted as below. If the callbacks return failure, it will be propagated by
//! cuvidParseVideoData() to the application.
//! Parser picks default operating point as 0 and outputAllLayers flag as 0 if PFNVIDOPPOINTCALLBACK is not set or return value is
//! -1 or invalid operating point.
//! PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces
//! while creating parser)
//! PFNVIDDECODECALLBACK : 0: fail, >=1: succeeded
//! PFNVIDDISPLAYCALLBACK : 0: fail, >=1: succeeded
//! PFNVIDOPPOINTCALLBACK : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint, bit 10-10: outputAllLayers, bit 11-30: reserved)
/***********************************************************************************************************************/
typedef int (CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
typedef int (CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
typedef int (CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
typedef int (CUDAAPI *PFNVIDOPPOINTCALLBACK)(void *, CUVIDOPERATINGPOINTINFO*);
/**************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERPARAMS
//! Used in cuvidCreateVideoParser API
/**************************************/
typedef struct _CUVIDPARSERPARAMS
{
cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */
unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these) */
unsigned int ulClockRate; /**< IN: Timestamp units in Hz (0=default=10000000Hz) */
unsigned int ulErrorThreshold; /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always
IN: call pfnDecodePicture even if picture bitstream is fully corrupted) */
unsigned int ulMaxDisplayDelay; /**< IN: Max display queue delay (improves pipelining of decode with display)
0=no delay (recommended values: 2..4) */
unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
unsigned int uReserved1[4]; /**< IN: Reserved for future use - set to 0 */
void *pUserData; /**< IN: User data for callbacks */
PFNVIDSEQUENCECALLBACK pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */
PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */
PFNVIDDISPLAYCALLBACK pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order) */
PFNVIDOPPOINTCALLBACK pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get operating point of a AV1
scalable bitstream */
void *pvReserved2[6]; /**< Reserved for future use - set to NULL */
CUVIDEOFORMATEX *pExtVideoInfo; /**< IN: [Optional] sequence header data from system layer */
} CUVIDPARSERPARAMS;
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams)
//! Create video parser object and initialize
/************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket)
//! Parse the video data from source data packet in pPacket
//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and
//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking of HW decoding
//! calls back pfnSequenceCallback with CUVIDEOFORMAT data for initial sequence header or when
//! the decoder encounters a video format change
//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a video frame
/************************************************************************************************/
CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
//! Destroy the video parser
/************************************************************************************************/
CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
/**********************************************************************************************/
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif // __NVCUVID_H__
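
For orientation, here is a minimal sketch of how the parser entry points declared above are typically wired together. It is not part of this diff; the names OnSequence, OnDecode, OnDisplay, createParser and feedPacket are illustrative, a current CUDA context is assumed, and error handling is elided.

#include "nvcuvid.h"

// Sequence callback: returning a value > 1 overrides the parser's DPB size
// (see the callback notes above); min_num_decode_surfaces is the smallest
// value that still decodes correctly.
static int CUDAAPI OnSequence(void *, CUVIDEOFORMAT *fmt) {
  return fmt->min_num_decode_surfaces;
}
// Decode/display callbacks: >= 1 means success.
static int CUDAAPI OnDecode(void *, CUVIDPICPARAMS *) { return 1; }
static int CUDAAPI OnDisplay(void *, CUVIDPARSERDISPINFO *) { return 1; }

static CUvideoparser createParser(void *user_data) {
  CUVIDPARSERPARAMS params = {};
  params.CodecType = cudaVideoCodec_H264;  // from cuviddec.h
  params.ulMaxNumDecodeSurfaces = 1;       // re-negotiated via OnSequence
  params.ulMaxDisplayDelay = 0;            // 0 = lowest latency; 2..4 pipelines better
  params.pUserData = user_data;
  params.pfnSequenceCallback = OnSequence;
  params.pfnDecodePicture = OnDecode;
  params.pfnDisplayPicture = OnDisplay;
  CUvideoparser parser = nullptr;
  cuvidCreateVideoParser(&parser, &params);
  return parser;
}

static void feedPacket(CUvideoparser parser, const unsigned char *data, unsigned long size) {
  CUVIDSOURCEDATAPACKET pkt = {};
  pkt.payload = data;
  pkt.payload_size = size;
  if (data == nullptr || size == 0)
    pkt.flags = CUVID_PKT_ENDOFSTREAM;  // flush the parser at end of stream
  cuvidParseVideoData(parser, &pkt);    // callbacks fire synchronously in here
}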



@@ -0,0 +1,167 @@
This SDK includes portions of FFMPEG, under the following license:
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this license
document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this license
document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of this
License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.



@@ -0,0 +1,9 @@
Reality Labs notes.
This is a shortened set of the NVIDIA Video Codec SDK.
The Doc, Lib and Samples folders are removed since we do not use them anyway.
Only the Interface folder and the readme and license files are left.


BIN extern/perceptualdiff/test/cam_mb.tif vendored Normal file


BIN extern/perceptualdiff/test/fish1.png vendored Normal file


BIN extern/perceptualdiff/test/fish2.png vendored Normal file


extern/perceptualdiff/test/run_tests.sh vendored Executable file (49 lines)

@@ -0,0 +1,49 @@
#!/bin/bash
# Script to run pdiff against a set of image file pairs, and check that the
# PASS or FAIL status is as expected.
#------------------------------------------------------------------------------
# Image files and expected perceptualdiff PASS/FAIL status. Line format is
# (PASS|FAIL) image1.(tif|png) image2.(tif|png)
#
# Edit the following lines to add additional tests.
function all_tests {
cat <<EOF
FAIL Bug1102605_ref.tif Bug1102605.tif
PASS Bug1471457_ref.tif Bug1471457.tif
PASS cam_mb_ref.tif cam_mb.tif
FAIL fish2.png fish1.png
EOF
}
# Modify pdiffBinary to point to your compiled pdiff executable if desired.
pdiffBinary=../perceptualdiff
#------------------------------------------------------------------------------
totalTests=0
numTestsFailed=0
# Run all tests.
while read expectedResult image1 image2 ; do
if $pdiffBinary -verbose $image1 $image2 | grep -q "^$expectedResult" ; then
totalTests=$(($totalTests+1))
else
numTestsFailed=$(($numTestsFailed+1))
echo "Regression failure: expected $expectedResult for \"$pdiffBinary $image1 $image2\"" >&2
fi
done <<EOF
$(all_tests)
EOF
# (the above with the EOF's is a stupid bash trick to stop while from running
# in a subshell)
# Give some diagnostics:
if [[ $numTestsFailed == 0 ]] ; then
echo "*** all $totalTests tests passed"
else
echo "*** $numTestsFailed failed tests of $totalTests"
fi
exit $numTestsFailed


@@ -276,7 +276,12 @@ if(WITH_CYCLES_DEVICE_METAL)
endif()
if(WITH_CYCLES_DEVICE_ONEAPI)
add_definitions(-DWITH_ONEAPI)
# FRL_CLR_BEGIN
#TODO: T146077207
message(STATUS "Temporarily turn WITH_ONEAPI off on monaco-cluster branch due to it does not compile there. TODO: fix and put it back. Task: T146077207 ")
# add_definitions(-DWITH_ONEAPI)
# FRL_CLR_END
endif()
if(WITH_CYCLES_EMBREE)


@@ -8,58 +8,63 @@ set(INC
../../../../extern # for flatbuffers
${TurboJPEG_INCLUDE_DIRS}
${OPENIMAGEIO_INCLUDE_DIR}
# ${CUDA_TOOLKIT_INCLUDE}
../../cluster_rendering/libcluster
../../cluster_rendering/libcluster/compression # for nv_encoder
../../cluster_rendering/libnetwork
${NVCODEC_PUBLIC_INTERFACE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
)
if(WITH_WEBRTC)
list(APPEND INC ../../cluster_rendering/libstream/include)
endif(WITH_WEBRTC)
set(INC_SYS
)
if(NOT DEFINED NVENCODEAPI_LIB)
if(WIN32)
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
find_library(NVENCODEAPI_LIB NAMES nvencodeapi HINTS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../lib/Video_Codec_SDK_11.1.5/Lib/" PATH_SUFFIXES x64/)
else()
find_library(NVENCODEAPI_LIB NAMES nvencodeapi HINTS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../lib/Video_Codec_SDK_11.1.5/Lib/" PATH_SUFFIXES Win32/)
endif()
else()
find_library(NVENCODEAPI_LIB NAMES nvidia-encode HINTS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../lib/Video_Codec_SDK_11.1.5/Lib/" PATH_SUFFIXES linux/stubs/x86_64/)
if(NOT DEFINED NVENCODEAPI_LIB)
find_library(NVENCODEAPI_LIB NAMES nvidia-encode HINTS "/Lib64/")
endif()
endif()
if(NVENCODEAPI_LIB)
message(STATUS "Found NVidia encoding library:${NVENCODEAPI_LIB}")
else()
message(WARNING "NVidia encoding library not found.")
endif()
else()
message(STATUS "Using existing NVENCODEAPI_LIB, skipping library search")
endif()
find_package(Boost 1.48 COMPONENTS program_options serialization thread filesystem)
add_definitions(-DHEADLESS_CLIENT)
set(LIBRARIES
cycles_libnetwork
cycles_libcluster
cycles_libnetwork
${TurboJPEG_LIBRARIES}
${OPENCOLORIO_LIBRARIES}
${OPENIMAGEIO_LIBRARY}
${OPENIMAGEIO_UTIL_LIBRARY}
${PNG_LIBRARIES}
${JPEG_LIBRARIES}
${TIFF_LIBRARY}
${OPENEXR_LIBRARIES}
${OPENJPEG_LIBRARIES}
${ZLIB_LIBRARIES}
${PUGIXML_LIBRARY}
${WEBP_LIBRARIES}
${Boost_LIBRARIES}
${GLOG_LIBRARIES}
)
if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
list(APPEND LIBRARIES ${NVENCODEAPI_LIB}) # For NVCODEC
if(UNIX AND NOT APPLE)
list(APPEND LIB
bf_intern_libc_compat
)
endif()
if(WITH_CYCLES_DEVICE_CUDA AND UNIX)
if(NOT DEFINED NVENCODEAPI_LIB)
find_library(NVENCODEAPI_LIB nvidia-encode)
find_library(NVCUVID_LIB nvcuvid HINTS "/lib64/") # the same folder on dev laptop and docker image
if(NOT NVENCODEAPI_LIB)
message(FATAL_ERROR "NVidia encoding library not found.")
endif()
else()
message(STATUS "Using pre-defined NVENCODEAPI_LIB: ${NVENCODEAPI_LIB}")
endif()
list(APPEND LIBRARIES ${NVENCODEAPI_LIB})
if(WITH_CUDA_DYNLOAD AND NOT CUDA_CUDA_LIBRARY)
list(APPEND INC "../../../../extern/cuew/include/")
list(APPEND LIBRARIES extern_cuew)
add_definitions(-DWITH_CUDA_DYNLOAD)
else()
list(APPEND INC ${CUDA_TOOLKIT_INCLUDE})
list(APPEND LIBRARIES ${CUDA_CUDA_LIBRARY})
remove_definitions(-DWITH_CUDA_DYNLOAD)
endif()
endif()
# Only use cuew if CUDA_CUDA_LIBRARY not specified
@@ -82,20 +87,11 @@ else()
endif()
list(APPEND LIBRARIES ${CMAKE_DL_LIBS})
link_directories(${BOOST_LIBPATH})
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
set(CMAKE_EXE_LINKER_FLAGS "-std=c++11 -pthread")
file(GLOB SRC *.cpp)
# Normally we should just link against libcluster and do not compile against source files in libcluster
# however libcluster pulls tons of dependencies including server/master side dependencies
# most of which we do not need what makes this light client not light at all.
# We may refactor libcluster to split it on client and master/server parts.
# Since on this light client we only need a functionality of classes listed below
# we work around dependencies issue by only compiling against following source files we need:
list(APPEND SRC ../../cluster_rendering/libcluster/rpc_blender_protocol.cpp)
list(APPEND SRC ../../cluster_rendering/libcluster/net_camera.cpp)
file(GLOB SRC *.cpp *.cu)
add_executable(${TARGET} ${SRC})
target_link_libraries(${TARGET} ${LIBRARIES})


@@ -28,8 +28,11 @@ const string Config::SAVE_IMAGES_PATH = "save_images_path";
const string Config::SAVE_VIDEO_STREAM_PATH = "save_video_stream_path";
const string Config::USER_EVENTS_PATH = "user_events_path";
const string Config::MASTER_IMAGE_COLOR_FORMAT = "master_image_color_format";
const string Config::MASTER_IMAGE_COMPRESSOR = "master_image_compressor";
const string Config::COLOR_FORMAT_LINEAR = "linear";
const string Config::COLOR_FORMAT_SRGB = "srgb";
const string Config::IMAGE_COMPRESSOR_JPEG = "JPEG";
const string Config::IMAGE_COMPRESSOR_NVENCODER = "NVENCODER";
Config::Config() :
@@ -55,6 +58,12 @@ bool Config::init(int argc, char* argv[]) {
COLOR_FORMAT_SRGB + ". " +
"Default is: " + COLOR_FORMAT_SRGB;
const std::string supported_image_compressors =
IMAGE_COMPRESSOR_JPEG + ", " +
IMAGE_COMPRESSOR_NVENCODER + ". " +
"Default is: " + IMAGE_COMPRESSOR_JPEG;
namespace po = boost::program_options;
po::options_description options("Allowed options");
options.add_options()
@@ -78,6 +87,10 @@ if not provided, client uses the resolution net camera object was saved with")
((USER_EVENTS_PATH + ",u").c_str(), po::value<string>(), "path to a file with user events")
((MASTER_IMAGE_COLOR_FORMAT + ",i").c_str(), po::value<string>(),
("master image color format. Options: " + supported_color_formats).c_str())
((MASTER_IMAGE_COMPRESSOR + ",i").c_str(), po::value<string>(),
("master image compressor. Options: " + supported_image_compressors).c_str())
((DENOISER + ",d").c_str(), po::value<string>(), denoiser_help.c_str());
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
@@ -191,6 +204,19 @@ if not provided, client uses the resolution net camera object was saved with")
}
}
if (vm.count(MASTER_IMAGE_COMPRESSOR)) {
string master_image_compressor_str = vm[MASTER_IMAGE_COMPRESSOR].as<string>();
if(master_image_compressor_str == IMAGE_COMPRESSOR_JPEG) {
master_image_compressor = ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG;
} else if(master_image_compressor_str == IMAGE_COMPRESSOR_NVENCODER) {
master_image_compressor = ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NVENCODER;
} else {
VLOG(1) << "FATAL. Requested image color format is not supported. Requested: " << master_image_compressor_str <<
" Supported: " << supported_color_formats;
return false;
}
}
return true;
}


@@ -35,8 +35,11 @@ class Config {
static const string SAVE_VIDEO_STREAM_PATH;
static const string USER_EVENTS_PATH;
static const string MASTER_IMAGE_COLOR_FORMAT;
static const string MASTER_IMAGE_COMPRESSOR;
static const string COLOR_FORMAT_LINEAR;
static const string COLOR_FORMAT_SRGB;
static const string IMAGE_COMPRESSOR_JPEG;
static const string IMAGE_COMPRESSOR_NVENCODER;
public:
size_t start_frame_id = 0;
@@ -60,6 +63,8 @@ public:
ClusterSessionParams::MasterDenoiser master_denoiser;
ClusterSessionParams::MasterImageColorFormat master_image_color_format =
ClusterSessionParams::MasterImageColorFormat::MASTER_IMAGE_COLOR_FORMAT_SRGB;
ClusterSessionParams::MasterImageCompressor master_image_compressor =
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG;
Config();
~Config();


@@ -8,15 +8,23 @@
#include "camera_provider.h"
#include "cluster_session_params.h"
#ifdef WITH_OPTIX
#include "cuda_context.h"
#ifdef WITH_CUDA
#include "cuda_context_provider.h"
#endif
#include "image_io_util.h"
#include "light_cluster_rendering_client.h"
#include "modify_object_message.h"
#include "net_camera.h"
#include "user_events_provider.h"
#ifdef WITH_CUDA
using cgr_libcluster::CudaContextProvider;
using cgr_libcluster::CUDAContextScope;
#endif
using cgr_libcluster::ImageIOUtil;
using cluster_rendering_net_lib::Retry;
using OpenImageIO_v2_4::ImageOutput;
using OpenImageIO_v2_4::TypeDesc;
using namespace std::chrono;
@@ -25,6 +33,7 @@ namespace headless_light_client {
const std::string LightClusterRenderingClient::IMAGE = "Image";
const std::string LightClusterRenderingClient::CAMERA = "Camera";
const std::string LightClusterRenderingClient::JPG = ".jpg";
const std::string LightClusterRenderingClient::PNG = ".png";
const double LightClusterRenderingClient::WAIT_LAST_FRAME_TIMEOUT_MS = 1500;
LightClusterRenderingClient::LightClusterRenderingClient(const Config & config, CameraProvider & camera_provider,
@@ -42,16 +51,12 @@ LightClusterRenderingClient::LightClusterRenderingClient(const Config & config,
image_client_events_handler(IMAGE),
netlib_camera_client(camera_client_events_handler),
netlib_image_client(image_client_events_handler)
#ifdef WITH_OPTIX
,encoded_image(INITIAL_SIZE_OF_BUFFER_FOR_ENCODED_IMAGE)
#endif
{
}
LightClusterRenderingClient::~LightClusterRenderingClient() {
#ifdef WITH_OPTIX
video_stream_file.close();
#endif
}
void LightClusterRenderingClient::run() {
@@ -69,10 +74,12 @@ void LightClusterRenderingClient::run() {
camera.cam_width = config.frame_width;
camera.cam_height = config.frame_height;
}
VLOG(3) << "client send camera for frame: " << camera.frame << " cam_width: " << camera.cam_width << " cam_height: " << camera.cam_height;
VLOG(3) << "client send camera for frame: " << camera.frame << " cam_width: " << camera.cam_width <<
" cam_height: " << camera.cam_height;
camera.sampleCount = samples_count;
camera.master_denoiser = master_denoiser;
camera.master_image_color_format = config.master_image_color_format;
camera.master_image_compressor = config.master_image_compressor;
if(is_first_camera) {
camera.frame = 0;
is_first_camera = false;
@@ -127,9 +134,9 @@ void LightClusterRenderingClient::nextCameraDelay(const int frame_id) {
}
}
#ifdef WITH_OPTIX
#ifdef WITH_CUDA
void LightClusterRenderingClient::encodeImage(NvEncoder * nv_encoder_ptr,
const std::vector<uint8_t> & raw_image, AutoEnlargingBuffer<uint8_t> & encoded_image_out) {
const std::vector<uint8_t> & raw_image, std::vector<uint8_t> & encoded_image_out) {
auto start = high_resolution_clock::now();
if(nv_encoder_ptr->encode(raw_image.data(), encoded_image_out)) {
std::chrono::duration<float> duration_sec = high_resolution_clock::now() - start;
@@ -140,9 +147,13 @@ void LightClusterRenderingClient::encodeImage(NvEncoder * nv_encoder_ptr,
throw std::runtime_error(message);
}
}
#endif
void LightClusterRenderingClient::writeImageToVideoStreamFile(std::ofstream &video_stream_file,
const AutoEnlargingBuffer<uint8_t> &encoded_image) {
const std::vector<uint8_t> &encoded_image) {
if(!video_stream_file.is_open()) {
video_stream_file.open(config.save_video_stream_path, std::ios::app | std::ios::binary);
}
if(video_stream_file.is_open()) {
video_stream_file.write(reinterpret_cast<const char*>(encoded_image.data()), encoded_image.size());
VLOG(3) << "Wrote encoded image of size: " << encoded_image.size();
@@ -151,7 +162,7 @@ void LightClusterRenderingClient::writeImageToVideoStreamFile(std::ofstream &vid
throw std::invalid_argument(message);
}
}
#endif
// Simple RGB to RGBA conversion.
// The headless client gets the image in JPEG format from the master and decompresses it into RGB format.
// This method converts from RGB to RGBA since NVEncoder expects an RGBA flavor.
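The conversion routine itself is outside this hunk; a hypothetical version of what the comment above describes might look like:

// Hypothetical sketch only; the actual rgbToRgba implementation is not shown in this diff.
void rgbToRgba(const std::vector<uint8_t> &rgb, std::vector<uint8_t> &rgba) {
  const size_t pixel_count = rgb.size() / 3;
  rgba.resize(pixel_count * 4);
  for (size_t i = 0; i < pixel_count; ++i) {
    rgba[4 * i + 0] = rgb[3 * i + 0];
    rgba[4 * i + 1] = rgb[3 * i + 1];
    rgba[4 * i + 2] = rgb[3 * i + 2];
    rgba[4 * i + 3] = 255;  // opaque alpha
  }
}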
@ -181,6 +192,8 @@ void LightClusterRenderingClient::imageChannelLoop() {
VLOG(3) << "Image channel received from master buffer size: " << buf_size;
NetCamera net_camera;
netlib_image_client.receive(&net_camera, sizeof(NetCamera));
// log line below is used by get_metrics.py to calculate stats. If you change it
// please make sure get_metrics.py still works correctly. Update if needed.
VLOG(3) << "net camera received for frame: " << net_camera.frame << " "
<< net_camera.cam_width << " " << net_camera.cam_height;
std::unique_ptr<std::vector<uint8_t> > image_buffer_uptr(new std::vector<uint8_t>(buf_size));
@ -190,6 +203,7 @@ void LightClusterRenderingClient::imageChannelLoop() {
received_image_from_master = true;
++num_received_frames;
outputImageIfRequested(image_buffer_uptr.get(), net_camera);
outputVideoStreamIfRequested(image_buffer_uptr.get(), net_camera);
image_cv.notify_one();
}
} else {
@@ -201,11 +215,13 @@ void LightClusterRenderingClient::imageChannelLoop() {
VLOG(3) << "Finished image channel thread.";
}
void LightClusterRenderingClient::outputImageIfRequested(std::vector<uint8_t> * jpeg_image, const NetCamera & net_camera) {
if(config.save_images_path.length() < 1 && config.save_video_stream_path.length() < 1) {
return; // no output is requested, return;
void LightClusterRenderingClient::outputImageIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera) {
if(config.save_images_path.length() < 1) {
return; // no image output is requested, return;
}
if(jpeg_image == nullptr) {
if(net_camera.master_image_compressor ==
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG) {
if(image == nullptr) {
std::string message("FATAL. Cannot output an image, a pointer to the image is null");
VLOG(3) << message;
throw std::runtime_error(message);
@@ -216,34 +232,71 @@ void LightClusterRenderingClient::outputImageIfRequested(std::vector<uint8_t> *
jpeg_tools_ptr.reset(new JpegTools());
}
// Images that we receive from the master are upside down, we need to flip images before saving
jpeg_tools_ptr->flipJpegImage(jpeg_image->data(), jpeg_image->size(), &flipped_jpeg, flipped_jpeg_size);
if(config.save_images_path.length() > 0) {
jpeg_tools_ptr->flipJpegImage(image->data(), image->size(), &flipped_jpeg, flipped_jpeg_size);
// Saving of noisy images (which are significantly larger due to salt & pepper noise)
// to /mnt/graphics_ssd may take about 30 ms, which limits the read-image thread to about 30 fps.
// Since we do not want IO operations to affect overall system performance at high fps,
// we save images in a different thread
boost::asio::post(save_image_thread_pool, std::bind(&LightClusterRenderingClient::saveImage, this,
boost::asio::post(save_image_thread_pool, std::bind(&LightClusterRenderingClient::saveImageAsIs, this,
getImagePath(net_camera.frame, JPG), flipped_jpeg, flipped_jpeg_size));
}
if(config.save_video_stream_path.length() > 0) {
#ifdef WITH_OPTIX
if(!nv_encoder_ptr) {
// create NVEncoder once we got the first frame with width and height of the image
VLOG(3) << "Creating NvEncoder";
nv_encoder_ptr.reset(new NvEncoder(NvEncoder::BUFFER_FORMAT_FOR_IMAGES_FROM_MASTER,
CudaContext::createCudaContext(),
net_camera.cam_width, net_camera.cam_height));
video_stream_file.open(config.save_video_stream_path, std::ios::app | std::ios::binary);
#ifdef WITH_CUDA
else if(net_camera.master_image_compressor ==
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
if(!nv_decoder_ptr) {
VLOG(3) << "Creating nvdecoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
nv_decoder_ptr.reset(new NvDecoder(cuda_context));
}
nv_decoder_ptr->decode(*image, net_camera.frame, &decoded_image);
boost::asio::post(save_image_thread_pool, std::bind(&LightClusterRenderingClient::saveImageInFormat, this,
PNG, net_camera.frame, decoded_image.data(), net_camera.cam_width, net_camera.cam_height));
}
#endif
else {
VLOG(1) << "Cannot save image, unknown compression: " << net_camera.master_image_compressor;
}
}
void LightClusterRenderingClient::outputVideoStreamIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera) {
if(config.save_video_stream_path.length() < 1) {
return; // no video stream output is requested, return;
}
if(net_camera.master_image_compressor ==
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_JPEG) {
#ifdef WITH_CUDA
if(!nv_encoder_ptr) {
VLOG(3) << "Creating NvEncoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
nv_encoder_ptr.reset(new NvEncoder(NV_ENC_BUFFER_FORMAT_ABGR,
cuda_context,
net_camera.cam_width, net_camera.cam_height));
}
std::vector<uint8_t> decompressed_image_rgb(net_camera.cam_width * net_camera.cam_height * 3);
std::vector<uint8_t> decompressed_image_rgba(net_camera.cam_width * net_camera.cam_height * 4);
uint8_t * flipped_jpeg = nullptr;
unsigned long flipped_jpeg_size = 0;
if(jpeg_tools_ptr == nullptr) {
jpeg_tools_ptr.reset(new JpegTools());
}
// Images that we receive from the master are upside down, we need to flip images before saving
jpeg_tools_ptr->flipJpegImage(image->data(), image->size(), &flipped_jpeg, flipped_jpeg_size);
jpeg_tools_ptr->decompressJpeg(flipped_jpeg, flipped_jpeg_size, decompressed_image_rgb);
rgbToRgba(decompressed_image_rgb, decompressed_image_rgba);
#ifdef WITH_OPTIX
encodeImage(nv_encoder_ptr.get(), decompressed_image_rgba, encoded_image);
writeImageToVideoStreamFile(video_stream_file, encoded_image);
#else
throw std::runtime_error("ERROR. Client is compiled without CUDA support so has no nvencoder and\
cannot encode received image and save video stream");
#endif
} else if(net_camera.master_image_compressor ==
ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
// image is already video-encoded by nvencoder, save it as is
writeImageToVideoStreamFile(video_stream_file, *image);
} else {
VLOG(1) << "Cannot save video stream, unknown compressor: " << net_camera.master_image_compressor;
}
}
@@ -254,17 +307,24 @@ std::string LightClusterRenderingClient::getImagePath(const int frame_id, const
return path_ostream.str();
}
void LightClusterRenderingClient::saveImage(const std::string file_path, unsigned char * jpeg_image, int jpeg_image_size) {
void LightClusterRenderingClient::saveImageAsIs(const std::string file_path, unsigned char * image, int image_size) {
VLOG(3) << "Saving image into: " << file_path;
auto start = high_resolution_clock::now();
std::ofstream wf(file_path, std::ios::out | std::ios::binary);
wf.write(reinterpret_cast<const char*>(jpeg_image), jpeg_image_size);
wf.write(reinterpret_cast<const char*>(image), image_size);
wf.close();
auto end = high_resolution_clock::now();
double time_taken = duration_cast<milliseconds>(end - start).count();
VLOG(3) << "Image saved successfully. Save image time: " << time_taken << " ms";
}
void LightClusterRenderingClient::saveImageInFormat(std::string format_extension, int frame_id, void * image,
int width, int height) {
std::string file_path = getImagePath(frame_id, format_extension);
std::unique_ptr<ImageOutput> image_output = std::unique_ptr<ImageOutput>(ImageOutput::create(file_path));
ImageIOUtil::saveFrame(file_path, TypeDesc::UCHAR, image_output.get(), image, width, height);
}
bool LightClusterRenderingClient::connectToMaster() {
bool connected = false;


@@ -11,8 +11,9 @@
#include "jpeg_tools.h"
#ifdef WITH_OPTIX
#ifdef WITH_CUDA
#include "nv_encoder.h"
#include "nv_decoder.h"
#endif
#include "camera_provider.h"
@@ -20,14 +21,15 @@
#include "netclient.h"
#include "netlib_event_handler.h"
#include "rpc_blender_protocol.h"
#include "./utils/vector_types.h" // for uchar3
namespace headless_light_client {
using cgr_libcluster::ClusterSessionParams;
using cgr_libcluster::RpcBlenderProtocol;
#ifdef WITH_OPTIX
#ifdef WITH_CUDA
using cgr_libcluster::NvDecoder;
using cgr_libcluster::NvEncoder;
using cgr_libcluster::AutoEnlargingBuffer;
#endif
class CameraProvider;
@@ -51,6 +53,8 @@ private:
static const std::string IMAGE;
static const std::string CAMERA;
static const std::string JPG;
static const std::string PNG;
static const size_t CUDA_DEVICE_NUM = 0;
static const size_t MAX_NUM_RETRIES = 5;
static const size_t RETRY_INTERVAL_MS = 2000;
static const size_t INITIAL_SIZE_OF_BUFFER_FOR_ENCODED_IMAGE = 50000;
@@ -81,11 +85,13 @@ private:
std::condition_variable image_cv;
std::atomic<bool> received_image_from_master = false;
std::unique_ptr<JpegTools> jpeg_tools_ptr;
#ifdef WITH_OPTIX
#ifdef WITH_CUDA
std::unique_ptr<NvEncoder> nv_encoder_ptr;
AutoEnlargingBuffer<uint8_t> encoded_image;
std::ofstream video_stream_file;
std::unique_ptr<NvDecoder> nv_decoder_ptr;
#endif
std::vector<uint8_t> encoded_image;
std::vector<cgr_libcluster::uchar3> decoded_image;
std::ofstream video_stream_file;
boost::asio::thread_pool save_image_thread_pool = boost::asio::thread_pool(4);
@@ -99,13 +105,16 @@ private:
void imageChannelLoop();
void nextCameraDelay(const int frame_id);
std::string getImagePath(const int frame_id, const std::string & file_extension);
void outputImageIfRequested(uint8_t * jpeg_image, int jpeg_image_size, const NetCamera & net_camera);
void outputImageIfRequested(std::vector<uint8_t> * jpeg_image, const NetCamera & net_camera);
void saveImage(const std::string file_path, unsigned char * jpeg_image, int jpeg_image_size);
#ifdef WITH_OPTIX
void writeImageToVideoStreamFile(std::ofstream &video_stream_file, const AutoEnlargingBuffer<uint8_t> &encoded_image);
void outputVideoStreamIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera);
void outputImageIfRequested(std::vector<uint8_t> * image, const NetCamera & net_camera);
void saveImageAsIs(const std::string file_path, unsigned char * image, int image_size);
// Saves image in a format specified as a file extension
void saveImageInFormat(std::string format_extension, int frame_id, void * image, int width, int height);
void writeImageToVideoStreamFile(std::ofstream &video_stream_file,
const std::vector<uint8_t> &encoded_image);
#ifdef WITH_CUDA
void encodeImage(NvEncoder * nv_encoder_ptr, const std::vector<uint8_t> & raw_image,
AutoEnlargingBuffer<uint8_t> & encoded_image_out);
std::vector<uint8_t> & encoded_image_out);
#endif
};


@@ -29,8 +29,8 @@ using headless_light_client::LightClusterRenderingClient;
using headless_light_client::UserEventsProvider;
static void initLogging(const char* argv0) {
setenv("GLOG_v", "3", 1);
setenv("GLOG_vmodule", "session_network*=3", 1);
//setenv("GLOG_v", "3", 1);
//setenv("GLOG_vmodule", "session_network*=3", 1);
google::InitGoogleLogging(argv0);
GFLAGS_NAMESPACE::SetCommandLineOption("logtostderr", "1");
}


@@ -163,6 +163,7 @@ class CYCLES_RENDER_PT_render_session_mode(CyclesButtonsPanel, Panel):
netsub.enabled = net.enabled and render.render_session_mode == 'MASTER'
netsub.prop(render, "num_servers")
netport.prop(render, "master_image_color_format")
netport.prop(render, "master_image_compressor")
#Temp turned off. TODO: [pmishchuk] enable when ready
# import _cycles
# if _cycles.with_webrtc:


@@ -871,6 +871,13 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
// use default color format.
master_image_color_format_num = cgr_libcluster::ClusterSessionParams::DEFAULT_MASTER_IMAGE_COLOR_FORMAT;
}
int master_image_compressor_num = r.master_image_compressor();
if (master_image_compressor_num <= cgr_libcluster::ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_NONE ||
master_image_compressor_num >= cgr_libcluster::ClusterSessionParams::MasterImageCompressor::MASTER_IMAGE_COMPRESSOR_END) {
// Blend file has no saved value for image compressor (it was saved before we added this property)
// use the default image compressor.
master_image_compressor_num = cgr_libcluster::ClusterSessionParams::DEFAULT_MASTER_IMAGE_COMPRESSOR;
}
params.cluster_session_params = cgr_libcluster::ClusterSessionParams(
static_cast<cgr_libcluster::ClusterSessionParams::SessionMode>(r.render_session_mode()),
@@ -878,7 +885,8 @@
static_cast<cgr_libcluster::ClusterSessionParams::MasterDenoiser>(r.master_denoiser()),
r.save_denoise_io(), r.save_streamed_image(), save_every_n_images,
r.save_cameras(), r.filepath(),
static_cast<cgr_libcluster::ClusterSessionParams::MasterImageColorFormat>(master_image_color_format_num));
static_cast<cgr_libcluster::ClusterSessionParams::MasterImageColorFormat>(master_image_color_format_num),
static_cast<cgr_libcluster::ClusterSessionParams::MasterImageCompressor>(master_image_compressor_num));
params.cluster_session_params.fps = r.fps();
cgr_libcluster::ModifyObjectParams & modify_object_params = params.cluster_session_params.modify_object_params;


@@ -1,5 +1,5 @@
if(WITH_CYCLES_DEVICE_CUDA)
add_subdirectory(libcluster_cuda_kernels)
endif()
add_subdirectory(libcluster)
add_subdirectory(libnetwork)
if(WITH_WEBRTC)
add_subdirectory(libstream)
endif(WITH_WEBRTC)


@@ -9,7 +9,9 @@ add_subdirectory(test)
#SET(OIDN_FOLDER ${OIDN_PATH}/oidn.linux)
#endif()
find_package(Boost 1.48 COMPONENTS serialization REQUIRED)
# From William: find_package(Boost 1.48 COMPONENTS serialization REQUIRED)
# Does not work on Windows at the moment, so turning this REQUIRED off
find_package(Boost 1.48 COMPONENTS serialization)
find_package(TurboJPEG REQUIRED)
# Uncomment instead of the above if using VCPKG
# find_package(libjpeg-turbo REQUIRED PATHS)
@ -54,34 +56,52 @@ set(SRC
net_client.cpp
rpc_blender_protocol.cpp
image_io_util.cpp
compression/turbojpeg_compressor.cpp
utils/timer.cpp
utils/vector_types.cpp
utils/image.cpp
denoising/denoise_buffer.cpp
denoising/denoising_context.cpp
denoising/master_denoiser.cpp
denoising/master_oidn_denoiser.cpp
)
if(WITH_CYCLES_DEVICE_OPTIX)
list(APPEND SRC denoising/master_optix_denoiser.cpp)
endif()
set(FLAT_BUFFER_FILES
net_camera.fbs
)
function(FLATBUFFERS_GENERATE_C_HEADERS Name)
if(NOT FLATBUFFERS_FLATC_EXECUTABLE)
set(FLATBUFFERS_FLATC_EXECUTABLE $<TARGET_FILE:flatc>)
if(NOT WIN32)
set(FLATBUFFERS_FLATC_EXECUTABLE ${CMAKE_BINARY_DIR}/bin/flatc)
message(WARNING "Using flatc binary FLATBUFFERS_FLATC_EXECUTABLE")
else()
# There seems to be a bug on Windows that when the CMAKE_BUILD_TYPE is undefined the
# CMAKE_BUILD_TYPE_INIT is set to "Debug" when it should be "Release"
message("Foo is located at:${CONFIG}")
# if($<CONFIG:Debug>)
set(FLATBUFFERS_FLATC_EXECUTABLE $<TARGET_FILE:flatc>) # ${CMAKE_BINARY_DIR}/bin/Debug/flatc.exe)
# if(NOT WIN32)
# #if(APPLE)
# # There seems to be a bug on Windows that when the CMAKE_BUILD_TYPE is undefined the
# # CMAKE_BUILD_TYPE_INIT is set to "Debug" when it should be "Release"
# message("Foo is located at:${CONFIG}")
# # if($<CONFIG:Debug>)
# set(FLATBUFFERS_FLATC_EXECUTABLE $<TARGET_FILE:flatc>) # ${CMAKE_BINARY_DIR}/bin/Debug/flatc)
# #else()
# # set(FLATBUFFERS_FLATC_EXECUTABLE ${CMAKE_BINARY_DIR}/bin/Release/flatc)
# #endif()
# message(WARNING "Using MacOS flatc binary:${FLATBUFFERS_FLATC_EXECUTABLE}")
# #else()
# set(FLATBUFFERS_FLATC_EXECUTABLE ${CMAKE_BINARY_DIR}/bin/flatc)
# message(WARNING "Using flatc binary FLATBUFFERS_FLATC_EXECUTABLE")
# #endif()
# else()
# set(FLATBUFFERS_FLATC_EXECUTABLE ${CMAKE_BINARY_DIR}/bin/Release/flatc.exe)
# # There seems to be a bug on Windows that when the CMAKE_BUILD_TYPE is undefined the
# # CMAKE_BUILD_TYPE_INIT is set to "Debug" when it should be "Release"
# message("Foo is located at:${CONFIG}")
# # if($<CONFIG:Debug>)
# set(FLATBUFFERS_FLATC_EXECUTABLE $<TARGET_FILE:flatc>) # ${CMAKE_BINARY_DIR}/bin/Debug/flatc.exe)
# #else()
# # set(FLATBUFFERS_FLATC_EXECUTABLE ${CMAKE_BINARY_DIR}/bin/Release/flatc.exe)
# #endif()
# message(WARNING "Using Windows flatc binary:${FLATBUFFERS_FLATC_EXECUTABLE} for build type:${CMAKE_BUILD_TYPE} ${CMAKE_BUILD_TYPE_INIT}")
# endif()
message(WARNING "Using Windows flatc binary:${FLATBUFFERS_FLATC_EXECUTABLE} for build type:${CMAKE_BUILD_TYPE} ${CMAKE_BUILD_TYPE_INIT}")
endif()
endif()
set(FLATC_OUTPUTS)
foreach(FILE ${ARGN})
@ -140,6 +160,7 @@ set(SRC_HEADERS
utils/logging.h
utils/timer.h
utils/vector_types.h
utils/image.h
denoising/denoise_buffer.h
denoising/denoising_context.h
denoising/master_denoiser.h
@ -152,22 +173,31 @@ set(LIB
${Boost_LIBRARIES}
${TurboJPEG_LIBRARIES}
flatbuffers
# rt
)
if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
message(STATUS "Using CUDA adding nv_encoder")
if(WITH_CYCLES_DEVICE_OPTIX)
list(APPEND INC ${OPTIX_INCLUDE_DIRS})
list(APPEND SRC denoising/master_optix_denoiser.cpp compression/nv_encoder.cpp)
list(APPEND SRC_HEADERS denoising/master_optix_denoiser.h compression/nv_encoder.h)
if(WITH_CUDA_DYNLOAD)
list(APPEND LIB
extern_cuew
)
else()
list(APPEND LIB
${CUDA_CUDA_LIBRARY}
)
endif()
if(WITH_CYCLES_DEVICE_CUDA AND UNIX)
message(STATUS "Building with CUDA, adding nv_encoder to the libcluster")
find_library(NVENCODEAPI_LIB nvidia-encode)
find_library(NVCUVID_LIB nvcuvid HINTS "/lib64/") # the same folder on dev laptop and docker image
list(APPEND SRC compression/nv_encoder.cpp)
list(APPEND SRC_HEADERS compression/nv_encoder.h)
list(APPEND SRC compression/nv_decoder.cpp)
list(APPEND SRC_HEADERS compression/nv_decoder.h)
list(APPEND LIB cycles_libcluster_cuda_kernels)
list(APPEND LIB ${NVENCODEAPI_LIB})
list(APPEND LIB ${NVCUVID_LIB})
if(WITH_CUDA_DYNLOAD)
list(APPEND LIB extern_cuew)
else()
list(APPEND LIB ${CUDA_CUDA_LIBRARY})
endif()
else()
message(STATUS "No CUDA or we are not on Linux so building libcluster without nv_encoder")
endif()
#if(APPLE)

View File

@ -61,12 +61,24 @@ public:
// END must go last
MASTER_IMAGE_COLOR_FORMAT_END,
};
enum MasterImageCompressor{
MASTER_IMAGE_COMPRESSOR_NONE = 0,
// Add new options here between _NONE and _END
// List of formats must be in sync with list in DNA_scene_types.h
MASTER_IMAGE_COMPRESSOR_JPEG = 1,
MASTER_IMAGE_COMPRESSOR_NVENCODER = 2,
// END must go last
MASTER_IMAGE_COMPRESSOR_END,
};
static const MasterImageColorFormat DEFAULT_MASTER_IMAGE_COLOR_FORMAT = MASTER_IMAGE_COLOR_FORMAT_LINEAR;
static const MasterImageCompressor DEFAULT_MASTER_IMAGE_COMPRESSOR = MASTER_IMAGE_COMPRESSOR_JPEG;
ClusterSessionParams(SessionMode session_mode, std::string master_address, int master_port, int num_servers,
int master_compression_quality, MasterDenoiser master_denoiser, bool save_denoise_io, bool save_streamed_image,
int save_every_n_images, bool save_cameras, const std::string & output_folder_path,
MasterImageColorFormat master_image_color_format) :
MasterImageColorFormat master_image_color_format, MasterImageCompressor master_image_compressor) :
session_mode(session_mode),
master_address(master_address),
master_port(master_port),
@ -74,6 +86,7 @@ public:
master_compression_quality(master_compression_quality),
master_denoiser(master_denoiser),
master_image_color_format(master_image_color_format),
master_image_compressor(master_image_compressor),
save_denoise_io(save_denoise_io),
save_streamed_image(save_streamed_image),
save_every_n_images(save_every_n_images),
@ -89,6 +102,7 @@ public:
master_compression_quality = 85;
master_denoiser = MASTER_DENOISER_NONE;
master_image_color_format = MASTER_IMAGE_COLOR_FORMAT_LINEAR;
master_image_compressor = MASTER_IMAGE_COMPRESSOR_JPEG;
}
bool modified(const ClusterSessionParams &params) const
@ -103,6 +117,7 @@ public:
save_every_n_images == params.save_every_n_images &&
save_cameras == params.save_cameras &&
master_image_color_format == params.master_image_color_format &&
master_image_compressor == params.master_image_compressor &&
output_folder_path == params.output_folder_path);
// modified method should not compare modify_object_params.
// When modified method returns true a session is restarted.
@ -116,6 +131,7 @@ public:
int master_compression_quality;
MasterDenoiser master_denoiser;
MasterImageColorFormat master_image_color_format;
MasterImageCompressor master_image_compressor;
bool save_denoise_io = false;
bool save_streamed_image = false;
// When save_every_n_images is 10, we only save 0th, 10th, 20th, 30th images

View File

@ -0,0 +1,262 @@
#include <chrono>
#include <cmath>
#include <stdexcept>
#include <string>
#include "nv_decoder.h"
#include "../cuda_context_provider.h"
#include "../utils/cuda_utils.h"
#include "../utils/image.h"
namespace cgr_libcluster {
static const char * getVideoCodecString(cudaVideoCodec video_codec) {
static struct {
cudaVideoCodec codec;
const char *name;
} CodecToName [] = {
{ cudaVideoCodec_MPEG1, "MPEG-1" },
{ cudaVideoCodec_MPEG2, "MPEG-2" },
{ cudaVideoCodec_MPEG4, "MPEG-4 (ASP)" },
{ cudaVideoCodec_VC1, "VC-1/WMV" },
{ cudaVideoCodec_H264, "AVC/H.264" },
{ cudaVideoCodec_JPEG, "M-JPEG" },
{ cudaVideoCodec_H264_SVC, "H.264/SVC" },
{ cudaVideoCodec_H264_MVC, "H.264/MVC" },
{ cudaVideoCodec_HEVC, "H.265/HEVC" },
{ cudaVideoCodec_VP8, "VP8" },
{ cudaVideoCodec_VP9, "VP9" },
{ cudaVideoCodec_AV1, "AV1" },
{ cudaVideoCodec_NumCodecs, "Invalid" },
{ cudaVideoCodec_YUV420, "YUV 4:2:0" },
{ cudaVideoCodec_YV12, "YV12 4:2:0" },
{ cudaVideoCodec_NV12, "NV12 4:2:0" },
{ cudaVideoCodec_YUYV, "YUYV 4:2:2" },
{ cudaVideoCodec_UYVY, "UYVY 4:2:2" },
};
if (video_codec >= 0 && video_codec <= cudaVideoCodec_NumCodecs) {
return CodecToName[video_codec].name;
}
for (size_t i = cudaVideoCodec_NumCodecs + 1; i < sizeof(CodecToName) / sizeof(CodecToName[0]); i++) {
if (video_codec == CodecToName[i].codec) {
return CodecToName[i].name;
}
}
return "Unknown";
}
static const char * getVideoChromaFormatString(cudaVideoChromaFormat chroma_format) {
static struct {
cudaVideoChromaFormat chroma_format;
const char *name;
} ChromaFormatToName[] = {
{ cudaVideoChromaFormat_Monochrome, "YUV 400 (Monochrome)" },
{ cudaVideoChromaFormat_420, "YUV 420" },
{ cudaVideoChromaFormat_422, "YUV 422" },
{ cudaVideoChromaFormat_444, "YUV 444" },
};
if (chroma_format >= 0 && chroma_format < sizeof(ChromaFormatToName) / sizeof(ChromaFormatToName[0])) {
return ChromaFormatToName[chroma_format].name;
}
return "Unknown";
}
static float getChromaHeightFactor(cudaVideoSurfaceFormat surface_format) {
float factor = 0.5;
switch (surface_format) {
case cudaVideoSurfaceFormat_NV12:
case cudaVideoSurfaceFormat_P016:
factor = 0.5;
break;
case cudaVideoSurfaceFormat_YUV444:
case cudaVideoSurfaceFormat_YUV444_16Bit:
factor = 1.0;
break;
}
return factor;
}
static int getChromaPlaneCount(cudaVideoSurfaceFormat surface_format) {
int numPlane = 1;
switch (surface_format) {
case cudaVideoSurfaceFormat_NV12:
case cudaVideoSurfaceFormat_P016:
numPlane = 1;
break;
case cudaVideoSurfaceFormat_YUV444:
case cudaVideoSurfaceFormat_YUV444_16Bit:
numPlane = 2;
break;
}
return numPlane;
}
NvDecoder::NvDecoder(CUcontext cuda_context):
m_cuda_context(cuda_context) {
LOG(INFO) << "NvDecoder constructor. Creating video parser";
CUVIDPARSERPARAMS videoParserParameters = {};
videoParserParameters.CodecType = cudaVideoCodec_H264;
videoParserParameters.ulMaxNumDecodeSurfaces = 1;
videoParserParameters.ulClockRate = 1000;
constexpr int low_latency_display_delay = 0;
videoParserParameters.ulMaxDisplayDelay = low_latency_display_delay;
videoParserParameters.pUserData = this;
videoParserParameters.pfnSequenceCallback = HandleVideoSequenceCallback;
videoParserParameters.pfnDecodePicture = HandlePictureDecodeCallback;
videoParserParameters.pfnDisplayPicture = HandlePictureDisplayCallback;
CUDA_API_CALL(cuvidCreateVideoParser(&m_parser, &videoParserParameters), THROW_IF_ERROR);
LOG(INFO) << "Created video parser";
}
NvDecoder::~NvDecoder() {
LOG(INFO) << "NvDecoder destructor";
if (m_parser) {
LOG(INFO) << "Destroying video parser";
CUDA_API_CALL(cuvidDestroyVideoParser(m_parser), DO_NOT_THROW);
}
if (m_decoder) {
LOG(INFO) << "Destroying video decoder";
CUDA_API_CALL(cuvidDestroyDecoder(m_decoder), DO_NOT_THROW);
}
LOG(INFO) << "NvDecoder released resources and is destroyed";
}
// Return values:
// 0 : fail
// 1 : succeeded, but driver should not override CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces
// >1: succeeded, and driver should override CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces with this return value
int NvDecoder::HandleVideoSequence(CUVIDEOFORMAT *video_format) {
LOG(INFO) << "HandleVideoSequence callback. Creating nvdecoder";
LOG(INFO) << "Video Input Information:";
LOG(INFO) << "\tCodec : " << getVideoCodecString(video_format->codec);
LOG(INFO) << "\tSequence : " << (video_format->progressive_sequence ? "Progressive" : "Interlaced");
LOG(INFO) << "\tCoded size : [" << video_format->coded_width << ", " << video_format->coded_height << "]";
LOG(INFO) << "\tDisplay area : [" << video_format->display_area.left << ", " << video_format->display_area.top << ", "
<< video_format->display_area.right << ", " << video_format->display_area.bottom << "]";
LOG(INFO) << "\tChroma : " << getVideoChromaFormatString(video_format->chroma_format);
LOG(INFO) << "\tBit depth : " << video_format->bit_depth_luma_minus8 + 8;
CUVIDDECODECREATEINFO video_decode_create_info = { 0 };
video_decode_create_info.CodecType = video_format->codec;
video_decode_create_info.ChromaFormat = video_format->chroma_format;
video_decode_create_info.OutputFormat = cudaVideoSurfaceFormat_NV12;
video_decode_create_info.bitDepthMinus8 = video_format->bit_depth_luma_minus8;
video_decode_create_info.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
video_decode_create_info.ulNumOutputSurfaces = 2;
video_decode_create_info.ulCreationFlags = cudaVideoCreate_PreferCUVID;
// This is how NVIDIA recommends calculating ulNumDecodeSurfaces:
// https://developer.nvidia.com/blog/optimizing-video-memory-usage-with-the-nvdecode-api-and-nvidia-video-codec-sdk/
video_decode_create_info.ulNumDecodeSurfaces = video_format->min_num_decode_surfaces + 3;
video_decode_create_info.ulWidth = video_format->coded_width;
video_decode_create_info.ulHeight = video_format->coded_height;
video_decode_create_info.ulMaxWidth = video_decode_create_info.ulWidth;
video_decode_create_info.ulMaxHeight = video_decode_create_info.ulHeight;
video_decode_create_info.ulTargetWidth = video_decode_create_info.ulWidth;
video_decode_create_info.ulTargetHeight = video_decode_create_info.ulHeight;
m_image_width_in_pixels = video_format->display_area.right - video_format->display_area.left;
// NV12/P016 output format width is 2-byte aligned because of U and V interleave
if (m_output_format == cudaVideoSurfaceFormat_NV12 ||
m_output_format == cudaVideoSurfaceFormat_P016) {
m_image_width_in_pixels = (m_image_width_in_pixels + 1) & ~1;
}
m_luma_height = video_format->display_area.bottom - video_format->display_area.top;
m_bytes_per_pixel = video_decode_create_info.bitDepthMinus8 > 0 ? 2 : 1;
m_chroma_height = (int)(std::ceil(m_luma_height * getChromaHeightFactor(video_decode_create_info.OutputFormat)));
m_num_chroma_planes = getChromaPlaneCount(video_decode_create_info.OutputFormat);
m_surface_height = video_decode_create_info.ulTargetHeight;
const int size_of_decoded_image_yuv_in_bytes = m_image_width_in_pixels *
(m_luma_height + (m_chroma_height * m_num_chroma_planes)) * m_bytes_per_pixel;
m_decoded_image_yuv.resize(size_of_decoded_image_yuv_in_bytes);
CUDAContextScope cuda_context_scope(m_cuda_context);
CUDA_API_CALL(cuvidCreateDecoder(&m_decoder, &video_decode_create_info), THROW_IF_ERROR);
return video_decode_create_info.ulNumDecodeSurfaces;
}
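As a worked check of the buffer sizing above, assuming the NV12 defaults this decoder requests: for a 1920x1080 8-bit stream, the chroma height is ceil(1080 * 0.5) = 540 with one chroma plane, so the staging buffer is 1920 * (1080 + 540) * 1 = 3,110,400 bytes, i.e. the usual 1.5 bytes per pixel of NV12.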
void NvDecoder::decode(const std::vector<uint8_t> & encoded_image_in, const int frame_id,
std::vector<cgr_libcluster::uchar3> * decoded_image_out_ptr) {
if(!decoded_image_out_ptr) {
std::string message = "Pointer to decoded image buffer is null. There is no place for output image, break.";
LOG(ERROR) << message;
throw std::invalid_argument(message);
}
m_decoded_image_rgb_out_ptr = decoded_image_out_ptr;
CUVIDSOURCEDATAPACKET packet = { 0 };
packet.payload = encoded_image_in.data();
packet.payload_size = encoded_image_in.size();
packet.flags = CUVID_PKT_ENDOFPICTURE | CUVID_PKT_TIMESTAMP;
packet.timestamp = 0;
if (encoded_image_in.size() == 0) {
packet.flags |= CUVID_PKT_ENDOFSTREAM;
}
CUDA_API_CALL(cuvidParseVideoData(m_parser, &packet), THROW_IF_ERROR);
}
// 0: fail, >=1: succeeded
int NvDecoder::HandlePictureDecode(CUVIDPICPARAMS *pic_params) {
CUDAContextScope cuda_context_scope(m_cuda_context);
CUDA_API_CALL(cuvidDecodePicture(m_decoder, pic_params), THROW_IF_ERROR);
return 1;
}
// 0: fail; >=1: succeeded
int NvDecoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *parser_disp_info) {
CUVIDPROCPARAMS video_processing_parameters = {};
video_processing_parameters.progressive_frame = parser_disp_info->progressive_frame;
video_processing_parameters.second_field = parser_disp_info->repeat_first_field + 1;
video_processing_parameters.top_field_first = parser_disp_info->top_field_first;
video_processing_parameters.unpaired_field = parser_disp_info->repeat_first_field < 0;
video_processing_parameters.output_stream = m_cuvid_stream;
CUdeviceptr src_frame_device_ptr = 0;
unsigned int src_pitch = 0;
CUDAContextScope cuda_context_scope(m_cuda_context);
CUDA_API_CALL(cuvidMapVideoFrame(m_decoder, parser_disp_info->picture_index, &src_frame_device_ptr,
&src_pitch, &video_processing_parameters), THROW_IF_ERROR);
CUVIDGETDECODESTATUS decode_status;
memset(&decode_status, 0, sizeof(decode_status));
CUresult result = cuvidGetDecodeStatus(m_decoder, parser_disp_info->picture_index, &decode_status);
if (result == CUDA_SUCCESS &&
(decode_status.decodeStatus == cuvidDecodeStatus_Error ||
decode_status.decodeStatus == cuvidDecodeStatus_Error_Concealed)) {
LOG(INFO) << "Image decoding failed with status: " << decode_status.decodeStatus;
}
uint8_t *decoded_image_yuv_ptr = m_decoded_image_yuv.data();
// Copy luma plane
CUDA_MEMCPY2D mem_cpy_2d = { 0 };
mem_cpy_2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
mem_cpy_2d.srcDevice = src_frame_device_ptr;
mem_cpy_2d.srcPitch = src_pitch;
mem_cpy_2d.dstMemoryType = CU_MEMORYTYPE_HOST;
mem_cpy_2d.dstDevice = (CUdeviceptr)(mem_cpy_2d.dstHost = decoded_image_yuv_ptr);
mem_cpy_2d.dstPitch = m_device_frame_pitch ? m_device_frame_pitch : m_image_width_in_pixels * m_bytes_per_pixel;
mem_cpy_2d.WidthInBytes = m_image_width_in_pixels * m_bytes_per_pixel;
mem_cpy_2d.Height = m_luma_height;
CUDA_API_CALL(cuMemcpy2DAsync(&mem_cpy_2d, m_cuvid_stream), THROW_IF_ERROR);
// Copy chroma plane
// NVDEC output has luma height aligned by 2. Adjust chroma offset by aligning height
mem_cpy_2d.srcDevice = (CUdeviceptr)((uint8_t *)src_frame_device_ptr + mem_cpy_2d.srcPitch * ((m_surface_height + 1) & ~1));
mem_cpy_2d.dstDevice = (CUdeviceptr)(mem_cpy_2d.dstHost = decoded_image_yuv_ptr + mem_cpy_2d.dstPitch * m_luma_height);
mem_cpy_2d.Height = m_chroma_height;
CUDA_API_CALL(cuMemcpy2DAsync(&mem_cpy_2d, m_cuvid_stream), THROW_IF_ERROR);
if (m_num_chroma_planes == 2) {
mem_cpy_2d.srcDevice = (CUdeviceptr)((uint8_t *)src_frame_device_ptr + mem_cpy_2d.srcPitch * ((m_surface_height + 1) & ~1) * 2);
mem_cpy_2d.dstDevice = (CUdeviceptr)(mem_cpy_2d.dstHost = decoded_image_yuv_ptr + mem_cpy_2d.dstPitch * m_luma_height * 2);
mem_cpy_2d.Height = m_chroma_height;
CUDA_API_CALL(cuMemcpy2DAsync(&mem_cpy_2d, m_cuvid_stream), THROW_IF_ERROR);
}
CUDA_API_CALL(cuStreamSynchronize(m_cuvid_stream), THROW_IF_ERROR);
CUDA_API_CALL(cuvidUnmapVideoFrame(m_decoder, src_frame_device_ptr), THROW_IF_ERROR);
const int num_pixels = m_image_width_in_pixels * m_luma_height;
if(m_decoded_image_rgb_out_ptr->size() != num_pixels) {
m_decoded_image_rgb_out_ptr->resize(num_pixels);
}
yuv2Rgb(decoded_image_yuv_ptr, m_image_width_in_pixels, m_luma_height, m_decoded_image_rgb_out_ptr);
return 1;
}
}
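A minimal usage sketch of the decoder above, assuming a primary context from CudaContextProvider and one H.264 access unit per packet (which matches the CUVID_PKT_ENDOFPICTURE flag set in decode()); error handling is elided:

#include <vector>
#include "nv_decoder.h"
#include "../cuda_context_provider.h"

void decodeOneFrame(const std::vector<uint8_t> &h264_access_unit) {
  CUcontext ctx = cgr_libcluster::CudaContextProvider::getPrimaryContext(/*device_num=*/0);
  cgr_libcluster::NvDecoder decoder(ctx);
  std::vector<cgr_libcluster::uchar3> rgb_out;
  // decode() drives cuvidParseVideoData; for a complete picture the
  // HandlePictureDisplay callback fills rgb_out with the RGB conversion.
  decoder.decode(h264_access_unit, /*frame_id=*/0, &rgb_out);
}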

View File

@ -0,0 +1,81 @@
#ifndef __NV_DECODER_H__
#define __NV_DECODER_H__
#ifdef WITH_CUDA_DYNLOAD
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
#include "nvcuvid.h"
#include "../utils/logging.h"
#include "../utils/vector_types.h" // for uchar3
namespace cgr_libcluster {
class NvDecoder {
public:
NvDecoder(CUcontext cuda_context);
~NvDecoder();
// Decodes the input image, converts it to RGB, and writes it to decoded_image_out_ptr.
// The decoder resizes the output vector if needed so it can accommodate the decoded image.
void decode(const std::vector<uint8_t> & encoded_image_in, const int frame_id,
std::vector<cgr_libcluster::uchar3> * decoded_image_out_ptr);
private:
const CUcontext m_cuda_context;
CUvideoparser m_parser = nullptr;
CUvideodecoder m_decoder = nullptr;
CUstream m_cuvid_stream = 0;
unsigned int m_image_width_in_pixels = 0;
unsigned int m_luma_height = 0;
unsigned int m_chroma_height = 0;
unsigned int m_num_chroma_planes = 0;
int m_bytes_per_pixel = 1;
size_t m_device_frame_pitch = 0;
int m_surface_height = 0;
cudaVideoSurfaceFormat m_output_format = cudaVideoSurfaceFormat_NV12;
std::vector<uint8_t> m_decoded_image_yuv;
std::vector<cgr_libcluster::uchar3> * m_decoded_image_rgb_out_ptr = nullptr;
// Callback function to be registered for getting a callback when decoding of sequence starts
static int CUDAAPI HandleVideoSequenceCallback(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
return ((NvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat);
}
// Callback function to be registered for getting a callback when a picture is ready to be decoded
static int CUDAAPI HandlePictureDecodeCallback(void *pUserData, CUVIDPICPARAMS *pPicParams) {
return ((NvDecoder *)pUserData)->HandlePictureDecode(pPicParams);
}
// Callback function to be registered for getting a callback when a decoded frame is available for display
static int CUDAAPI HandlePictureDisplayCallback(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo) {
return ((NvDecoder *)pUserData)->HandlePictureDisplay(pDispInfo);
}
// This function gets called when a sequence is ready to be decoded. The function also gets called
// when there is format change. It inits video decoder
int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
// This function gets called when a picture is ready to be decoded. cuvidDecodePicture is called from
// this function to decode the picture
int HandlePictureDecode(CUVIDPICPARAMS *pPicParams);
// This function gets called after a picture is decoded and available for display. Frames are fetched
// and stored in internal buffer
int HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo);
};
}
#endif
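The static callback members above are the usual trampoline for handing a C callback API a C++ object: the API stores a void* user pointer plus a plain function pointer, and the static shim casts the pointer back. A self-contained sketch of the pattern, with hypothetical names rather than the NVDECODE API:

typedef int (*EventCallback)(void *user_data, int event);

class Handler {
public:
  // Static shim with the exact C signature; the void* carries `this`.
  static int Trampoline(void *user_data, int event) {
    return static_cast<Handler *>(user_data)->onEvent(event);
  }
private:
  int onEvent(int event) { return event > 0 ? 1 : 0; }
};

// Hypothetical registration call: register_callback(&Handler::Trampoline, &handler);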

View File

@ -6,6 +6,7 @@
#include <glog/logging.h>
#include "nv_encoder.h"
#include "../cuda_context_provider.h"
namespace cgr_libcluster {
@ -24,22 +25,56 @@ NvEncoder::NvEncoder(NV_ENC_BUFFER_FORMAT buffer_format,
}
NvEncoder::~NvEncoder() {
VLOG(3) << "NvEncoder destructor";
if(m_encoder_session_handle.get()) {
nvencode_api_function_list.nvEncDestroyBitstreamBuffer(m_encoder_session_handle.get(), m_bitstream_output_buffer.get());
nvencode_api_function_list.nvEncUnregisterResource(m_encoder_session_handle.get(), m_input_buffer_registration.get());
nvencode_api_function_list.nvEncDestroyEncoder(m_encoder_session_handle.get());
NV_ENC_PIC_PARAMS end_of_input_stream_pic_params = {NV_ENC_PIC_PARAMS_VER};
end_of_input_stream_pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncEncodePicture(
m_encoder_session_handle.get(), &end_of_input_stream_pic_params);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncEncodePicture call with end of stream failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
} else {
VLOG(3) << "Communicated end of stream to the NvEncoder";
}
nv_error_code = m_nvencode_api_function_list.nvEncUnregisterResource(m_encoder_session_handle.get(),
m_input_buffer_registration.get());
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. NvEncoder input biffer un-registration failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
} else {
VLOG(3) << "Unregistered NvEncoder input biffer successfully";
}
nv_error_code = m_nvencode_api_function_list.nvEncDestroyBitstreamBuffer(m_encoder_session_handle.get(),
m_bitstream_output_buffer.get());
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncDestroyBitstreamBuffer failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
} else {
VLOG(3) << "Destroyed NvEncoder bitstream buffer successfully";
}
m_nvencode_api_function_list.nvEncDestroyEncoder(m_encoder_session_handle.get());
VLOG(3) << "Destroyed NvEncoder session successfully";
}
if(m_cuda_context) {
cuCtxPushCurrent(m_cuda_context);
CUDAContextScope cuda_context_scope(m_cuda_context);
cuMemFree(m_cuda_device_ptr.get());
cuCtxPopCurrent(NULL);
VLOG(3) << "Released cuda input buffer successfully";
}
VLOG(3) << "NvEncoder released resources and is destroyed";
}
void NvEncoder::initNvEncoder() {
// Most of the values set here are taken from the NVIDIA sample NvEncoder.cpp;
// we can adjust them later if needed.
VLOG(3) << "Initializing NVENCODER for width: " << m_width << " height: " << m_height;
uint32_t driver_version = 0;
uint32_t header_version = (NVENCAPI_MAJOR_VERSION << 4) | NVENCAPI_MINOR_VERSION;
NVENCSTATUS nv_error_code_v = NvEncodeAPIGetMaxSupportedVersion(&driver_version);
@ -56,15 +91,15 @@ void NvEncoder::initNvEncoder() {
VLOG(1) << message;
throw std::runtime_error(message);
}
nvencode_api_function_list = { NV_ENCODE_API_FUNCTION_LIST_VER };
NVENCSTATUS nv_error_code = NvEncodeAPICreateInstance(&nvencode_api_function_list);
m_nvencode_api_function_list = { NV_ENCODE_API_FUNCTION_LIST_VER };
NVENCSTATUS nv_error_code = NvEncodeAPICreateInstance(&m_nvencode_api_function_list);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. NvEncodeAPICreateInstance failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
throw std::runtime_error(message);
}
if(!nvencode_api_function_list.nvEncOpenEncodeSessionEx) {
if(!m_nvencode_api_function_list.nvEncOpenEncodeSessionEx) {
std::string message = "FATAL. nvEncOpenEncodeSessionEx API not found";
VLOG(1) << message;
throw std::runtime_error(message);
@ -74,13 +109,13 @@ void NvEncoder::initNvEncoder() {
encode_session_ex_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
encode_session_ex_params.apiVersion = NVENCAPI_VERSION;
void * encoder_session_handle;
nv_error_code = nvencode_api_function_list.nvEncOpenEncodeSessionEx(&encode_session_ex_params, &encoder_session_handle);
nv_error_code = m_nvencode_api_function_list.nvEncOpenEncodeSessionEx(&encode_session_ex_params, &encoder_session_handle);
m_encoder_session_handle.init(encoder_session_handle, __FILE__, __FUNCTION__, __LINE__);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncOpenEncodeSessionEx failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
nvencode_api_function_list.nvEncDestroyEncoder(m_encoder_session_handle.get());
m_nvencode_api_function_list.nvEncDestroyEncoder(m_encoder_session_handle.get());
throw std::runtime_error(message);
}
if(!m_encoder_session_handle.get()) {
@ -92,66 +127,66 @@ void NvEncoder::initNvEncoder() {
// extern/nvidia/Video_Codec_SDK_11.1.5/Interface/nvEncodeAPI.h
// github link:
// https://ghe.oculus-rep.com/FRL-Graphics-Research/distributed_blender_cycles/blob/cluster_blender_32_main_nvenc/extern/nvidia/Video_Codec_SDK_11.1.5/Interface/nvEncodeAPI.h#L1667
NV_ENC_INITIALIZE_PARAMS encoder_init_params = { 0 };
encoder_init_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
GUID preset = NV_ENC_PRESET_P1_GUID;
encoder_init_params.encodeGUID = NV_ENC_CODEC_H264_GUID;
encoder_init_params.presetGUID = preset;
encoder_init_params.encodeWidth = m_width;
encoder_init_params.encodeHeight = m_height;
encoder_init_params.darWidth = m_width;
encoder_init_params.darHeight = m_height;
encoder_init_params.frameRateNum = 30;
encoder_init_params.frameRateDen = 1;
encoder_init_params.enablePTD = 1;
encoder_init_params.reportSliceOffsets = 0;
encoder_init_params.enableSubFrameWrite = 0;
encoder_init_params.maxEncodeWidth = m_width;
encoder_init_params.maxEncodeHeight = m_height;
encoder_init_params.enableMEOnlyMode = false;
encoder_init_params.enableOutputInVidmem = false;
encoder_init_params.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY;
//encoder_init_params.tuningInfo = NV_ENC_TUNING_INFO_HIGH_QUALITY;
m_encoder_init_params = { 0 };
m_encoder_init_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
GUID preset_guid = NV_ENC_PRESET_P1_GUID;
m_encoder_init_params.encodeGUID = NV_ENC_CODEC_H264_GUID;
m_encoder_init_params.presetGUID = preset_guid;
m_encoder_init_params.encodeWidth = m_width;
m_encoder_init_params.encodeHeight = m_height;
m_encoder_init_params.darWidth = m_width;
m_encoder_init_params.darHeight = m_height;
m_encoder_init_params.frameRateNum = 30;
m_encoder_init_params.frameRateDen = 1;
m_encoder_init_params.enablePTD = 1;
m_encoder_init_params.reportSliceOffsets = 0;
m_encoder_init_params.enableSubFrameWrite = 0;
m_encoder_init_params.maxEncodeWidth = m_width;
m_encoder_init_params.maxEncodeHeight = m_height;
m_encoder_init_params.enableMEOnlyMode = false;
m_encoder_init_params.enableOutputInVidmem = false;
m_encoder_init_params.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY;
//m_encoder_init_params.tuningInfo = NV_ENC_TUNING_INFO_HIGH_QUALITY;
NV_ENC_CONFIG encode_config = { 0 };
encoder_init_params.encodeConfig = &encode_config;
NV_ENC_PRESET_CONFIG presetConfig_2 = { NV_ENC_PRESET_CONFIG_VER, { NV_ENC_CONFIG_VER } };
nvencode_api_function_list.nvEncGetEncodePresetConfigEx(m_encoder_session_handle.get(),
NV_ENC_CODEC_H264_GUID, preset, encoder_init_params.tuningInfo, &presetConfig_2);
memcpy(encoder_init_params.encodeConfig, &presetConfig_2.presetCfg, sizeof(NV_ENC_CONFIG));
m_encode_config = { 0 };
m_encoder_init_params.encodeConfig = &m_encode_config;
NV_ENC_PRESET_CONFIG preset_config_ex = { NV_ENC_PRESET_CONFIG_VER, { NV_ENC_CONFIG_VER } };
m_nvencode_api_function_list.nvEncGetEncodePresetConfigEx(m_encoder_session_handle.get(),
m_encoder_init_params.encodeGUID, preset_guid, m_encoder_init_params.tuningInfo, &preset_config_ex);
memcpy(m_encoder_init_params.encodeConfig, &preset_config_ex.presetCfg, sizeof(NV_ENC_CONFIG));
encode_config.version = NV_ENC_CONFIG_VER;
encode_config.frameIntervalP = 1;
encode_config.gopLength = NVENC_INFINITE_GOPLENGTH;
encode_config.encodeCodecConfig.h264Config.idrPeriod = encode_config.gopLength;
encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CBR;
encode_config.rcParams.multiPass = NV_ENC_TWO_PASS_FULL_RESOLUTION;
m_encode_config.frameIntervalP = 1;
m_encode_config.gopLength = NVENC_INFINITE_GOPLENGTH;
m_encode_config.encodeCodecConfig.h264Config.idrPeriod = m_encode_config.gopLength;
m_encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CBR;
m_encode_config.rcParams.multiPass = NV_ENC_TWO_PASS_FULL_RESOLUTION;
// This produces images with acceptable quality. Tested on Tesla and Trudy scenes on flat screen.
// We may adjust the value of the bitrate based of results of further testing.
encode_config.rcParams.averageBitRate = 2800000;
m_encode_config.rcParams.averageBitRate = 2800000;
// Below is bitrate calculation from the NVIDIA samples app. It's too low for us.
// Keep it here for now for reference. Remove once bitrate calculation is finalised.
// encode_config.rcParams.averageBitRate = (static_cast<unsigned int>(
// 5.0f * encoder_init_params.encodeWidth * encoder_init_params.encodeHeight) /
// m_encode_config.rcParams.averageBitRate = (static_cast<unsigned int>(
// 5.0f * m_encoder_init_params.encodeWidth * m_encoder_init_params.encodeHeight) /
// (width * height)) * 100000;
VLOG(3) << "Average bitrate: " << encode_config.rcParams.averageBitRate;
encode_config.rcParams.vbvBufferSize = (encode_config.rcParams.averageBitRate * encoder_init_params.frameRateDen /
encoder_init_params.frameRateNum) * 5;
encode_config.rcParams.maxBitRate = encode_config.rcParams.averageBitRate * 2;
encode_config.rcParams.vbvInitialDelay = encode_config.rcParams.vbvBufferSize;
VLOG(3) << "Average bitrate: " << m_encode_config.rcParams.averageBitRate;
m_encode_config.rcParams.vbvBufferSize = (m_encode_config.rcParams.averageBitRate * m_encoder_init_params.frameRateDen /
m_encoder_init_params.frameRateNum) * 5;
m_encode_config.rcParams.maxBitRate = m_encode_config.rcParams.averageBitRate * 2;
m_encode_config.rcParams.vbvInitialDelay = m_encode_config.rcParams.vbvBufferSize;
nv_error_code = nvencode_api_function_list.nvEncInitializeEncoder(m_encoder_session_handle.get(), &encoder_init_params);
nv_error_code = m_nvencode_api_function_list.nvEncInitializeEncoder(m_encoder_session_handle.get(), &m_encoder_init_params);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncInitializeEncoder failed with error: "
+ std::to_string(nv_error_code);
VLOG(1) << message;
throw std::runtime_error(message);
}
VLOG(3) << "NvEncoder initialization completed";
}
void NvEncoder::allocateOutputBuffer() {
NV_ENC_CREATE_BITSTREAM_BUFFER create_bitstream_buffer_struct = { NV_ENC_CREATE_BITSTREAM_BUFFER_VER };
NVENCSTATUS nv_error_code = nvencode_api_function_list.nvEncCreateBitstreamBuffer(m_encoder_session_handle.get(),
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncCreateBitstreamBuffer(m_encoder_session_handle.get(),
&create_bitstream_buffer_struct);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncCreateBitstreamBuffer failed with error: "
@ -165,14 +200,7 @@ void NvEncoder::allocateOutputBuffer() {
}
void NvEncoder::allocateInputBuffer() {
CUresult cu_result = cuCtxPushCurrent(m_cuda_context);
if(cu_result != CUDA_SUCCESS) {
const char *error_name = NULL;
cuGetErrorName(cu_result, &error_name);
std::string message = "FATAL. cuCtxPushCurrent failed with error: " + std::string(error_name);
VLOG(1) << message;
throw std::runtime_error(message);
}
CUDAContextScope cuda_context_scope(m_cuda_context);
const int chroma_height = getChromaHeight(m_buffer_format, m_height);
const int height_in_rows = m_height + chroma_height;
const int width_in_bytes = getWidthInBytes(m_buffer_format, m_width);
@ -180,7 +208,7 @@ void NvEncoder::allocateInputBuffer() {
CUdeviceptr cuda_device_ptr;
VLOG(3) << "Allocating input buffer with cuMemAllocPitch cuda_pitch: " << m_cuda_pitch.get() << " width_in_bytes: " <<
width_in_bytes << " height in rows: " << height_in_rows;
cu_result = cuMemAllocPitch((CUdeviceptr *)&cuda_device_ptr,
CUresult cu_result = cuMemAllocPitch((CUdeviceptr *)&cuda_device_ptr,
&cuda_pitch,
width_in_bytes,
height_in_rows,
@ -195,14 +223,6 @@ void NvEncoder::allocateInputBuffer() {
m_cuda_pitch.init(cuda_pitch, __FILE__, __FUNCTION__, __LINE__);
m_cuda_device_ptr.init(cuda_device_ptr, __FILE__, __FUNCTION__, __LINE__);
VLOG(3) << "Successfully allocated input buffer with cuMemAllocPitch";
cu_result = cuCtxPopCurrent(NULL);
if(cu_result != CUDA_SUCCESS) {
const char *error_name = NULL;
cuGetErrorName(cu_result, &error_name);
std::string message = "FATAL. cuCtxPopCurrent failed with error: " + std::string(error_name);
VLOG(1) << message;
throw std::runtime_error(message);
}
VLOG(3) << "Input CUDA buffer allocation completed";
registerInputResources(cuda_device_ptr);
}
@ -221,7 +241,7 @@ void NvEncoder::registerInputResources(CUdeviceptr cuda_device_ptr) {
register_resource_struct.pOutputFencePoint = nullptr;
VLOG(3) << "nvEncRegisterResource with width: " << register_resource_struct.width <<
" height: " << register_resource_struct.height << " pitch: " << register_resource_struct.pitch;
NVENCSTATUS nv_error_code = nvencode_api_function_list.nvEncRegisterResource(m_encoder_session_handle.get(),
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncRegisterResource(m_encoder_session_handle.get(),
&register_resource_struct);
if(nv_error_code != NV_ENC_SUCCESS) {
std::string message = "FATAL. nvEncRegisterResource failed with error: "
@ -245,41 +265,56 @@ void NvEncoder::registerInputResources(CUdeviceptr cuda_device_ptr) {
VLOG(3) << "Successfully registered input buffer resource";
}
bool NvEncoder::getEncodedBuffer(NV_ENC_MAP_INPUT_RESOURCE &map_input_resource, AutoEnlargingBuffer<uint8_t> &encoded_buffer_out,
bool NvEncoder::getEncodedBuffer(NV_ENC_MAP_INPUT_RESOURCE &map_input_resource, std::vector<uint8_t> &encoded_buffer_out,
high_resolution_clock::time_point & encoding_done) const {
NV_ENC_LOCK_BITSTREAM lock_bitstream_data = { NV_ENC_LOCK_BITSTREAM_VER };
lock_bitstream_data.outputBitstream = m_bitstream_output_buffer.get();
lock_bitstream_data.doNotWait = false;
NVENCSTATUS nv_error_code = nvencode_api_function_list.nvEncLockBitstream(m_encoder_session_handle.get(), &lock_bitstream_data);
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncLockBitstream(m_encoder_session_handle.get(), &lock_bitstream_data);
if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(1) << "ERROR. nvEncLockBitstream failed with error: " << nv_error_code;
return false;
}
encoding_done = high_resolution_clock::now();
uint8_t * encoded_data_ptr = (uint8_t *)lock_bitstream_data.bitstreamBufferPtr;
encoded_buffer_out.insert(&encoded_data_ptr[0], lock_bitstream_data.bitstreamSizeInBytes);
nv_error_code = nvencode_api_function_list.nvEncUnlockBitstream(m_encoder_session_handle.get(), lock_bitstream_data.outputBitstream);
encoded_buffer_out.assign(encoded_data_ptr, encoded_data_ptr + lock_bitstream_data.bitstreamSizeInBytes);
nv_error_code = m_nvencode_api_function_list.nvEncUnlockBitstream(m_encoder_session_handle.get(), lock_bitstream_data.outputBitstream);
if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(1) << "ERROR. nvEncUnlockBitstream failed with error: " << nv_error_code;
return false;
}
nv_error_code = nvencode_api_function_list.nvEncUnmapInputResource(m_encoder_session_handle.get(), map_input_resource.mappedResource);
nv_error_code = m_nvencode_api_function_list.nvEncUnmapInputResource(m_encoder_session_handle.get(), map_input_resource.mappedResource);
if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(1) << "ERROR. nvEncUnmapInputResource failed with error: " << nv_error_code;
return false;
}
std::chrono::duration<float> copy_to_cpu_time_sec = high_resolution_clock::now() - encoding_done;
VLOG(3) << "Time to copy encoded image buffer to CPU memory: " << copy_to_cpu_time_sec.count() << " sec";
VLOG(3) << "Time to copy encoded image buffer to CPU memory: " << copy_to_cpu_time_sec.count() <<
" sec. Size: " << encoded_buffer_out.size();
return true;
}
bool NvEncoder::encode(const uint8_t* input_buffer_on_host, AutoEnlargingBuffer<uint8_t> &encoded_buffer_out) const {
// Do explicit template instantiation for the encode() and
// copyInputBufferIntoGpuMappedMemory() methods (see below) for the following reasons:
// we only support buffers in CPU/host memory as uint8_t* and buffers in GPU/device memory as CUdeviceptr,
// so instantiate explicitly only for these types. This also allows us to keep the template
// method implementations in the .cpp file instead of the .h, keeping the header clean.
// Explicit template instantiation of encode for input buffers in CPU memory as uint8_t*
template bool NvEncoder::encode(const uint8_t*, std::vector<uint8_t> &) const;
// Explicit template instantiation of encode for input buffers in GPU memory as CUdeviceptr
template bool NvEncoder::encode(const CUdeviceptr, std::vector<uint8_t> &) const;
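To make the comment above concrete, a minimal standalone sketch of the technique, independent of this patch and using hypothetical names:

// upload.h (sketch): clients see only the declaration:
//   template <typename T> bool upload(T src);
// upload.cpp (sketch): the body stays out of the header; explicit instantiation
// emits object code for exactly the two supported source types, so the linker
// resolves calls even though the body is hidden from callers:
#include <cstdint>
template <typename T>
bool upload(T src) {
  (void)src;  // shared body; real code would dispatch on overloads here
  return true;
}
template bool upload<const uint8_t *>(const uint8_t *);
template bool upload<uint64_t>(uint64_t);  // stand-in for CUdeviceptr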
template<typename T>
bool NvEncoder::encode(const T input_buffer, std::vector<uint8_t> &encoded_buffer_out) const {
CUDAContextScope cuda_context_scope(m_cuda_context);
if(m_encoder_session_handle.get() == nullptr) {
VLOG(1) << "ERROR. encoder_session_handle is null. Encoder is not initialised or initialization failed.";
return false;
}
NV_ENC_MAP_INPUT_RESOURCE map_input_resource = { NV_ENC_MAP_INPUT_RESOURCE_VER };
if(!copyInputBufferToGpu(input_buffer_on_host, map_input_resource)) {
if(!copyInputBufferIntoGpuMappedMemory(input_buffer, map_input_resource)) {
return false;
}
auto start_encoding = high_resolution_clock::now();
@ -305,24 +340,31 @@ bool NvEncoder::encode(const uint8_t* input_buffer_on_host, AutoEnlargingBuffer<
}
const NVENCSTATUS NvEncoder::startEncoding(NV_ENC_MAP_INPUT_RESOURCE & map_input_resource) const {
NV_ENC_PIC_PARAMS picParams = {};
picParams.version = NV_ENC_PIC_PARAMS_VER;
picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
picParams.inputBuffer = map_input_resource.mappedResource;
picParams.bufferFmt = m_buffer_format;
picParams.inputWidth = m_width;
picParams.inputHeight = m_height;
picParams.outputBitstream = m_bitstream_output_buffer.get();
picParams.completionEvent = nullptr;
return nvencode_api_function_list.nvEncEncodePicture(m_encoder_session_handle.get(), &picParams);
NV_ENC_PIC_PARAMS pic_params = {};
pic_params.version = NV_ENC_PIC_PARAMS_VER;
pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
pic_params.inputBuffer = map_input_resource.mappedResource;
pic_params.bufferFmt = m_buffer_format;
pic_params.inputWidth = m_width;
pic_params.inputHeight = m_height;
pic_params.outputBitstream = m_bitstream_output_buffer.get();
pic_params.completionEvent = nullptr;
return m_nvencode_api_function_list.nvEncEncodePicture(m_encoder_session_handle.get(), &pic_params);
}
bool NvEncoder::copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC_MAP_INPUT_RESOURCE &map_input_resource) const {
// Explicit template instantiation of the copyInputBufferIntoGpuMappedMemory for the input buffers in cpu memory as uint8_t*
template bool NvEncoder::copyInputBufferIntoGpuMappedMemory(const uint8_t*, NV_ENC_MAP_INPUT_RESOURCE &) const;
// Explicit template instantiation of the copyInputBufferIntoGpuMappedMemory for the input buffers in gpu memory as CUdeviceptr
template bool NvEncoder::copyInputBufferIntoGpuMappedMemory(const CUdeviceptr, NV_ENC_MAP_INPUT_RESOURCE &) const;
template<typename T>
bool NvEncoder::copyInputBufferIntoGpuMappedMemory(T input_buffer,
NV_ENC_MAP_INPUT_RESOURCE &map_input_resource) const {
auto start_copy_to_gpu = high_resolution_clock::now();
const int width_in_bytes = getWidthInBytes(m_buffer_format, m_width);
CUDA_MEMCPY2D memcpy2d = { 0 };
memcpy2d.srcMemoryType = CU_MEMORYTYPE_HOST;
memcpy2d.srcHost = input_buffer_on_host;
setSourceBuffer(input_buffer, memcpy2d);
memcpy2d.srcPitch = getWidthInBytes(m_buffer_format, m_width);
memcpy2d.dstMemoryType = CU_MEMORYTYPE_DEVICE;
memcpy2d.dstDevice = m_cuda_device_ptr.get();
@ -331,7 +373,7 @@ bool NvEncoder::copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC
memcpy2d.Height = m_height;
cuMemcpy2D(&memcpy2d);
for (int i = 0; i < m_src_chroma_offsets.get().size(); ++i) {
memcpy2d.srcHost = (input_buffer_on_host + m_src_chroma_offsets.get()[i]);
setSourceBuffer(input_buffer + m_src_chroma_offsets.get()[i], memcpy2d);
memcpy2d.dstDevice = (CUdeviceptr)((uint8_t *)m_cuda_device_ptr.get() + m_dst_chroma_offsets.get()[i]);
memcpy2d.srcPitch = getChromaPitch(m_buffer_format, memcpy2d.srcPitch);
memcpy2d.dstPitch = getChromaPitch(m_buffer_format, memcpy2d.dstPitch);
@ -340,7 +382,7 @@ bool NvEncoder::copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC
cuMemcpy2D(&memcpy2d);
}
map_input_resource.registeredResource = m_input_buffer_registration.get();
NVENCSTATUS nv_error_code = nvencode_api_function_list.nvEncMapInputResource(m_encoder_session_handle.get(), &map_input_resource);
NVENCSTATUS nv_error_code = m_nvencode_api_function_list.nvEncMapInputResource(m_encoder_session_handle.get(), &map_input_resource);
if(nv_error_code != NV_ENC_SUCCESS) {
VLOG(3) << "ERROR. nvEncMapInputResource failed with error: " << std::to_string(nv_error_code);
return false;
@ -350,6 +392,16 @@ bool NvEncoder::copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC
return true;
}
void NvEncoder::setSourceBuffer(const uint8_t * input_buffer, CUDA_MEMCPY2D &memcpy2d) const {
memcpy2d.srcHost = input_buffer;
memcpy2d.srcMemoryType = CU_MEMORYTYPE_HOST;
}
void NvEncoder::setSourceBuffer(const CUdeviceptr input_buffer, CUDA_MEMCPY2D &memcpy2d) const {
memcpy2d.srcDevice = input_buffer;
memcpy2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
}
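With the two setSourceBuffer overloads above, the same template body serves host and device sources. A caller sketch, hedged on the assumption that the constructor fully initializes the encoder session (as the destructor's teardown implies); error handling is elided:

#include <vector>

void encodeFromHost(CUcontext ctx, const uint8_t *abgr_pixels, int width, int height) {
  cgr_libcluster::NvEncoder encoder(
      cgr_libcluster::NvEncoder::BUFFER_FORMAT_FOR_IMAGES_FROM_MASTER, ctx, width, height);
  std::vector<uint8_t> h264_out;
  // Host-pointer instantiation: copyInputBufferIntoGpuMappedMemory picks the
  // CU_MEMORYTYPE_HOST overload of setSourceBuffer.
  encoder.encode(abgr_pixels, h264_out);
  // A CUdeviceptr argument would select the CU_MEMORYTYPE_DEVICE overload instead.
}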
uint32_t NvEncoder::getWidthInBytes(const NV_ENC_BUFFER_FORMAT buffer_format, const uint32_t width) const {
switch (buffer_format) {
case NV_ENC_BUFFER_FORMAT_NV12:

View File

@ -1,76 +1,36 @@
#ifndef __NV_ENCODER_H__
#define __NV_ENCODER_H__
#include <chrono>
#include <vector>
//#include <cuda.h>
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
# include <cudaGL.h>
#endif
#include "final.h"
#include "nvEncodeAPI.h"
#include "../utils/logging.h"
namespace cgr_libcluster {
using namespace std::chrono;
// This class holds a preallocated buffer with a requested initial size.
// It enlarges the buffer automatically if it cannot accommodate incoming data.
// The buffer never shrinks; it only grows when needed, to reduce the number of memory re-allocations.
template <typename T>
class AutoEnlargingBuffer {
public:
AutoEnlargingBuffer(size_t init_size){
if(init_size > 0) {
resize(init_size);
}
}
~AutoEnlargingBuffer(){
delete [] data_buffer;
}
void insert(const T* input_data, const size_t input_data_size) {
if(input_data_size > m_capacity) {
VLOG(3) << "Current capacity of the AutoEnlargingBuffer: " << m_capacity <<
" Reallocating to support larger data size: " << input_data_size;
resize(input_data_size);
}
memcpy(data_buffer, input_data, input_data_size);
m_data_size = input_data_size;
}
T* data() const {
return data_buffer;
}
size_t size() const {
return m_data_size;
}
private:
T* data_buffer = nullptr;
size_t m_data_size = 0;
size_t m_capacity = 0;
void resize(const size_t new_buffer_size) {
delete [] data_buffer;
data_buffer = new T[new_buffer_size];
m_capacity = new_buffer_size;
m_data_size = 0;
}
};
class NvEncoder {
public:
static const NV_ENC_BUFFER_FORMAT BUFFER_FORMAT_FOR_IMAGES_FROM_MASTER = NV_ENC_BUFFER_FORMAT_ABGR;
NvEncoder(NV_ENC_BUFFER_FORMAT buffer_format, CUcontext cuda_context, const int width, const int height);
~NvEncoder();
bool encode(const uint8_t* input_buffer_on_host, AutoEnlargingBuffer<uint8_t> &encoded_buffer_out) const;
// Encodes an image within the input_buffer
template<typename T>
bool encode(const T input_buffer, std::vector<uint8_t> &encoded_buffer_out) const;
private:
static constexpr int DEVICE_NUM = 0;
@ -80,7 +40,9 @@ private:
const int m_width;
const int m_height;
const NV_ENC_BUFFER_FORMAT m_buffer_format;
NV_ENCODE_API_FUNCTION_LIST nvencode_api_function_list;
NV_ENC_INITIALIZE_PARAMS m_encoder_init_params;
NV_ENC_CONFIG m_encode_config;
NV_ENCODE_API_FUNCTION_LIST m_nvencode_api_function_list;
Final<void *> m_encoder_session_handle;
Final<size_t> m_cuda_pitch;
Final<CUdeviceptr> m_cuda_device_ptr;
@ -111,11 +73,17 @@ private:
// Registers preallocated input CUDA buffer for the NvEncoder with nvEncRegisterResource
void registerInputResources(CUdeviceptr cuda_device_ptr);
// Copies input image buffer (on CPU) to the preallocated and registered encoder input buffer on GPU (CUDA device)
bool copyInputBufferToGpu(const uint8_t* input_buffer_on_host, NV_ENC_MAP_INPUT_RESOURCE &map_input_resource) const;
// Copies input image buffer (on CPU or GPU memory) to the preallocated and registered
// encoder input buffer on GPU (CUDA device)
template<typename T>
bool copyInputBufferIntoGpuMappedMemory(T input_buffer, NV_ENC_MAP_INPUT_RESOURCE &map_input_resource) const;
void setSourceBuffer(const uint8_t * input_buffer, CUDA_MEMCPY2D &memcpy2d) const;
void setSourceBuffer(const CUdeviceptr input_buffer, CUDA_MEMCPY2D &memcpy2d) const;
// Copies encoded image from the encoder to the preallocated output buffer on CPU
bool getEncodedBuffer(NV_ENC_MAP_INPUT_RESOURCE &map_input_resource, AutoEnlargingBuffer<uint8_t> &encoded_buffer_out,
bool getEncodedBuffer(NV_ENC_MAP_INPUT_RESOURCE &map_input_resource,
std::vector<uint8_t> &encoded_buffer_out,
high_resolution_clock::time_point & encoding_done) const;
// Starts image encoding

View File

@ -0,0 +1,124 @@
//#include "./utils/timer.h" // for scoped_timer
#include "../utils/logging.h"
#include "turbojpeg_compressor.h"
namespace cgr_libcluster {
TurbojpegCompressor::TurbojpegCompressor() {
m_jpeg_compressor = tjInitCompress();
m_jpeg_decompressor = tjInitDecompress();
}
TurbojpegCompressor::~TurbojpegCompressor() {
if (m_jpeg_compressor != nullptr) {
tjDestroy(m_jpeg_compressor);
}
if (m_jpeg_decompressor != nullptr) {
tjDestroy(m_jpeg_decompressor);
}
}
//#define TIME_JPEG
size_t TurbojpegCompressor::compress(void* src_buffer, int width, int height, int compression_quality,
unsigned char*& jpeg_image) {
// The input buffer is treated as tightly packed 3-channel RGB (unsigned char)
const int subsampling = TJSAMP_444;
size_t jpeg_length = 0; // tjCompress2 will allocate the jpeg_image buffer
jpeg_image = nullptr;
#ifdef TIME_JPEG
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
#endif
if (m_jpeg_compressor == nullptr) {
LOG(ERROR) << "Cannot initialize JPEG compressor";
return 0;
}
int jpeg_error = tjCompress2(m_jpeg_compressor,
(unsigned char*) src_buffer,
width,
0,
height,
TJPF_RGB,
&jpeg_image,
(unsigned long *)&jpeg_length,
subsampling,
compression_quality,
TJFLAG_FASTDCT);
if (jpeg_error < 0) {
const char *jpeg_error_str = tjGetErrorStr();
LOG(ERROR) << "JPEG compression error: " << jpeg_error_str;
return 0;
}
#ifdef TIME_JPEG
clock_gettime(CLOCK_MONOTONIC, &end_time);
// elapsed time in ms; include the seconds part, not only tv_nsec
double elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1e3 + (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
LOG(INFO) << "TIMING: JPEG compression: " << elapsed_time << "ms"
<< ", resolution " << width << "x" << height
<< ", sizes " << ((size_t)width * height * 3) << " (" << jpeg_length << ")";
#endif
return jpeg_length;
}
bool TurbojpegCompressor::decompress(std::vector<uint8_t> & jpeg_image_buffer, int & width_out, int & height_out,
std::vector<cgr_libcluster::uchar3> & decompressed_image_out) {
#ifdef TIME_JPEG
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
#endif
// Use TurboJPEG to decompress the buffer
int subsampling = 0;
if (m_jpeg_decompressor == nullptr) {
LOG(ERROR) << "Cannot initialize JPEG decompressor";
return false;
}
int width = 0, height = 0;
int jpeg_error = tjDecompressHeader2(m_jpeg_decompressor, jpeg_image_buffer.data(),
jpeg_image_buffer.size(), &width, &height, &subsampling);
if (jpeg_error < 0) {
LOG(ERROR) << "Cannot decode JPEG header from incoming image buffer";
return false;
}
width_out = width;
height_out = height;
const size_t num_pixels = width_out * height_out;
if(decompressed_image_out.size() != num_pixels) {
decompressed_image_out.resize(num_pixels);
}
jpeg_error = tjDecompress2(m_jpeg_decompressor,
jpeg_image_buffer.data(),
jpeg_image_buffer.size(),
(unsigned char*) decompressed_image_out.data(),
width_out,
0,
height_out,
TJPF_RGB,
TJFLAG_ACCURATEDCT);
if (jpeg_error < 0) {
const char *jpeg_error_str = tjGetErrorStr();
LOG(ERROR) << "JPEG decompression error: " << jpeg_error_str;
return false;
}
#ifdef TIME_JPEG
clock_gettime(CLOCK_MONOTONIC, &end_time);
// elapsed time in ms; include the seconds part, not only tv_nsec
double elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1e3 + (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
LOG(INFO) << "TIMING: JPEG decompression: " << elapsed_time << "ms"
<< ", resolution " << width_out << "x" << height_out
<< ", sizes " << jpeg_image_buffer.size() << " (" << decompressed_image_out.size() * sizeof(uchar3) << ")";
#endif
return true;
}
void TurbojpegCompressor::free(uint8_t *memory) {
tjFree(memory);
}
} // cgr_libcluster
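A round-trip usage sketch for the class above, assuming a tightly packed RGB8 input of width*height*3 bytes; error handling is elided:

#include <vector>

void jpegRoundTrip(std::vector<unsigned char> &rgb_pixels, int width, int height) {
  cgr_libcluster::TurbojpegCompressor codec;
  unsigned char *jpeg_image = nullptr;  // allocated by tjCompress2 inside compress()
  size_t jpeg_size = codec.compress(rgb_pixels.data(), width, height, /*quality=*/85, jpeg_image);
  if (jpeg_size == 0) {
    return;
  }
  std::vector<uint8_t> jpeg_buffer(jpeg_image, jpeg_image + jpeg_size);
  codec.free(jpeg_image);  // release the buffer tjCompress2 allocated
  int width_out = 0, height_out = 0;
  std::vector<cgr_libcluster::uchar3> decoded;
  codec.decompress(jpeg_buffer, width_out, height_out, decoded);
}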

View File

@ -0,0 +1,34 @@
#ifndef __TURBOJPEG_COMPRESSOR_H__
#define __TURBOJPEG_COMPRESSOR_H__
#include <vector>
#include <turbojpeg.h>
#include "../utils/vector_types.h" // for uchar3
//#include "net_camera.h"
namespace cgr_libcluster {
class TurbojpegCompressor {
public:
TurbojpegCompressor();
~TurbojpegCompressor();
// Compresses the image buffer into a JPEG image
size_t compress(void* src_buffer, int width, int height, int compression_quality, unsigned char*& jpeg_image);
// Decompresses a JPEG stream buffer into an RGB pixel vector
bool decompress(std::vector<uint8_t> & jpeg_image_buffer, int & width_out, int & height_out,
std::vector<cgr_libcluster::uchar3> & decompressed_image_out);
void free(uint8_t *memory);
private:
tjhandle m_jpeg_compressor = nullptr;
tjhandle m_jpeg_decompressor = nullptr;
};
} // cgr_libcluster
#endif

View File

@ -0,0 +1,92 @@
#ifndef __CUDA_CONTEXT_PROVIDER_H__
#define __CUDA_CONTEXT_PROVIDER_H__
#include "./utils/logging.h"
#ifdef WITH_CUDA_DYNLOAD
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
namespace cgr_libcluster {
struct CUDAContextScope {
CUDAContextScope(CUcontext ctx) {
CUresult cu_result = cuCtxPushCurrent(ctx);
if(cu_result != CUDA_SUCCESS) {
const char *error_name = NULL;
cuGetErrorName(cu_result, &error_name);
std::string message = "FATAL. cuCtxPushCurrent failed with error: "
+ std::string(error_name ? error_name : "unknown");
LOG(ERROR) << message;
throw std::runtime_error(message);
}
}
~CUDAContextScope() {
CUresult cu_result = cuCtxPopCurrent(NULL);
if(cu_result != CUDA_SUCCESS) {
const char *error_name = NULL;
cuGetErrorName(cu_result, &error_name);
// Never throw from a destructor: throwing during stack unwinding calls std::terminate
LOG(ERROR) << "FATAL. cuCtxPopCurrent failed with error: " << (error_name ? error_name : "unknown");
}
}
};
class CudaContextProvider {
public:
static CUcontext getPrimaryContext(const int device_num) {
static CUcontext cuda_context = 0;
if(cuda_context) {
return cuda_context;
}
#ifdef WITH_CUDA_DYNLOAD
LOG(INFO) << "WITH_CUDA_DYNLOAD is on, call cuewInit to load CUDA library";
// When we run as part of Blender, the CUDA lib is normally already loaded by CUDADevice.
// However, it is not loaded when called from unit tests, so call cuewInit here to be sure
// the CUDA lib is loaded; it is safe to call cuewInit multiple times.
if (cuewInit(CUEW_INIT_CUDA) != CUEW_SUCCESS) {
throw std::runtime_error("Error. CUEW failed to load CUDA lib");
}
#endif
CUdevice cuda_device = 0;
CUresult cu_result = cuDeviceGet(&cuda_device, device_num);
// CUDA is normally initialised by CUDADevice; however, it is not initialised
// when called from unit tests. Check whether CUDA is initialised and run cuInit if not.
if(cu_result == CUDA_ERROR_NOT_INITIALIZED) {
LOG(WARNING) << "Cuda is not initialised, run cuInit";
cu_result = cuInit(0);
if (cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot initialise CUDA, cuInit failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
cu_result = cuDeviceGet(&cuda_device, device_num);
}
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot get primary cuda context due to cuDeviceGet failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
cu_result = cuDevicePrimaryCtxRetain(&cuda_context, cuda_device);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Failed to get primary cuda context "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
return cuda_context;
}
};
} // namespace cgr_libcluster
#endif
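A usage sketch for the pair above: retain the primary context once, then scope each CUDA call sequence so the context is popped even on early return (error handling elided):

void doCudaWork() {
  CUcontext ctx = cgr_libcluster::CudaContextProvider::getPrimaryContext(/*device_num=*/0);
  cgr_libcluster::CUDAContextScope scope(ctx);  // cuCtxPushCurrent here
  CUdeviceptr buffer = 0;
  cuMemAlloc(&buffer, 1024);
  cuMemFree(buffer);
}  // scope destructor runs cuCtxPopCurrent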

View File

@ -1,3 +1,6 @@
#ifdef WITH_CUDA
#include "../cuda_context_provider.h"
#endif
#include "denoising_context.h"
#include "master_oidn_denoiser.h"
#ifdef WITH_OPTIX
@ -26,7 +29,8 @@ MasterDenoiser * DenoisersProvider::makeOptixDenoiser(bool save_denoise_io, int
const std::string & output_folder_path, bool is_denoising_passes_on,
const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height) const {
try {
return new MasterOptixDenoiser(save_denoise_io, save_every_n_images, output_folder_path,
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(MasterOptixDenoiser::DEVICE_NUM);
return new MasterOptixDenoiser(cuda_context, save_denoise_io, save_every_n_images, output_folder_path,
is_denoising_passes_on, image_output_provider, max_img_width, max_img_height);
} catch(std::runtime_error & ex) {
LOG(ERROR) << "DenoisersProvider::makeDenoiser. ERROR. Failed to create an instance of MasterOptixDenoiser due to: "

View File

@ -1,3 +1,6 @@
#include "../cuda_context_provider.h"
#include "../utils/cuda_utils.h"
#include "../utils/timer.h" // for scoped_timer
#include "../utils/logging.h"
#include "../server_image.h" // for ServerImage;
@ -7,28 +10,24 @@
#include <optix_denoiser_tiling.h>
#include <optix.h>
namespace cgr_libcluster {
struct CUDAContextScope {
CUDAContextScope(CUcontext ctx) {
cuCtxPushCurrent(ctx);
}
~CUDAContextScope() {
cuCtxPopCurrent(NULL);
}
};
// The constructor throws an exception if it cannot create an instance.
// This prevents creation of an object that is not functional.
// It doesn't affect rendering performance, as this happens once, at the init stage.
MasterOptixDenoiser::MasterOptixDenoiser(bool save_denoise_io, int save_every_n_images, const std::string & output_folder_path,
bool is_denoising_passes_on, const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height) :
MasterOptixDenoiser::MasterOptixDenoiser(CUcontext cuda_context, bool save_denoise_io, int save_every_n_images,
const std::string & output_folder_path, bool is_denoising_passes_on,
const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height) :
MasterDenoiser(
ClusterSessionParams::MasterDenoiser::MASTER_DENOISER_OPTIX, save_denoise_io, save_every_n_images,
output_folder_path, is_denoising_passes_on, image_output_provider),
max_img_width(max_img_width), max_img_height(max_img_height) {
LOG(INFO) << "Creating MasterOptixDenoiser";
max_img_width(max_img_width), max_img_height(max_img_height),
cuda_context(cuda_context) {
LOG(INFO) << "Creating MasterOptixDenoiser with width: " << max_img_width << " height: " << max_img_height <<
" save_io: " << save_denoise_io << " save_every_n_images: " << save_every_n_images << " output_folder_path: " <<
output_folder_path << " is_denoising_passes_on: " << is_denoising_passes_on;
OptixDeviceContext optix_context = nullptr;
OptixDenoiserOptions denoiser_options;
if(is_denoising_passes_on) {
@ -39,166 +38,69 @@ MasterOptixDenoiser::MasterOptixDenoiser(bool save_denoise_io, int save_every_n_
denoiser_options.guideNormal = 0;
}
//denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
CUdevice cuda_device = 0;
OptixDeviceContextOptions options = {};
#ifdef WITH_CUDA_DYNLOAD
LOG(INFO) << "WITH_CUDA_DYNLOAD is on, call cuewInit to load CUDA library";
// When we run as part of Blender the CUDA lib is normally already loaded by CUDADevice.
// However it is not loaded when called from unit tests, so call cuewInit here to be sure
// that the CUDA lib is loaded; it is safe to call cuewInit multiple times.
if (cuewInit(CUEW_INIT_CUDA) != CUEW_SUCCESS) {
throw std::runtime_error("Error. CUEW failed to load CUDA lib");
}
#endif
CUresult cu_result = cuDeviceGet(&cuda_device, DEVICE_NUM);
// CUDA is normally initialised by CUDADevice, however it is not initialised
// when called from unit tests; check if CUDA is initialised and run cuInit if not.
if(cu_result == CUDA_ERROR_NOT_INITIALIZED) {
LOG(WARNING) << "Cuda is not initialised, run cuInit";
cu_result = cuInit(0);
if (cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot initialise CUDA, cuInit failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
cu_result = cuDeviceGet(&cuda_device, DEVICE_NUM);
}
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to cuDeviceGet failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
cu_result = cuCtxCreate(&cuda_context, ctx_flags, cuda_device);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to cuCtxCreate failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
const CUDAContextScope scope(cuda_context);
if (g_optixFunctionTable.optixDeviceContextCreate != NULL) {
LOG(INFO) << "Optix function table is already initialized, continue.";
} else {
LOG(INFO) << "Initializing Optix...";
OptixResult optix_result = optixInit();
if(optix_result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
std::string message = "Error. OptiX initialization failed because the installed driver does not support "
"ABI version: " + std::to_string(OPTIX_ABI_VERSION);
throw std::runtime_error(message);
}
else if(optix_result != OPTIX_SUCCESS) {
std::string message = "Error. OptiX initialization failed with error code: " + std::to_string(optix_result);
throw std::runtime_error(message);
}
OPTIX_API_CALL(optixInit(), THROW_IF_ERROR);
LOG(INFO) << "Initialization of Optix function table complete";
}
OptixResult optix_result = optixDeviceContextCreate(cuda_context, &options, &optix_context);
if(optix_result != OPTIX_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to optixDeviceContextCreate failed "
"with error code: " + std::to_string(optix_result);
throw std::runtime_error(message);
}
OPTIX_API_CALL(optixDeviceContextCreate(cuda_context, &options, &optix_context), THROW_IF_ERROR);
//TODO: figure out how to support KIND_TEMPORAL, needs a velocity vector
optix_result = optixDenoiserCreate(optix_context, OPTIX_DENOISER_MODEL_KIND_HDR,
&denoiser_options, &denoiser);
if(optix_result != OPTIX_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to optixDeviceContextCreate failed "
"with error code: " + std::to_string(optix_result);
throw std::runtime_error(message);
}
OPTIX_API_CALL(optixDenoiserCreate(optix_context, OPTIX_DENOISER_MODEL_KIND_HDR,
&denoiser_options, &denoiser), THROW_IF_ERROR);
if(denoiser == nullptr) {
throw std::runtime_error("Error. Cannot create MasterOptixDenoiser due to optixDenoiserCreate returned no denoiser");
}
/*
optix_result = optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, nullptr, 0);
if(optix_result != OPTIX_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to optixDenoiserSetModel failed "
"with error code: " + std::to_string(optix_result);
throw std::runtime_error(message);
}
*/
// OPTIX_API_CALL(optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, nullptr, 0), THROW_IF_ERROR);
memset(&denoiser_sizes, 0, sizeof(OptixDenoiserSizes));
optix_result = optixDenoiserComputeMemoryResources(denoiser, max_img_width, max_img_height, &denoiser_sizes);
if(optix_result != OPTIX_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to optixDenoiserComputeMemoryResources failed "
"with error code: " + std::to_string(optix_result);
throw std::runtime_error(message);
}
OPTIX_API_CALL(optixDenoiserComputeMemoryResources(denoiser, max_img_width, max_img_height, &denoiser_sizes),
THROW_IF_ERROR);
cu_result = cuMemAlloc(&state_denoiser, denoiser_sizes.stateSizeInBytes);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to cuMemAlloc for state_denoiser failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
cu_result = cuMemAlloc(&scratch_denoiser, denoiser_sizes.withoutOverlapScratchSizeInBytes);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to cuMemAlloc for scratch_denoiser failed "
"with error code: " + std::to_string(cu_result);
throw std::runtime_error(message);
}
optix_result = optixDenoiserSetup(
CUDA_API_CALL(cuMemAlloc(&state_denoiser, denoiser_sizes.stateSizeInBytes), THROW_IF_ERROR);
CUDA_API_CALL(cuMemAlloc(&scratch_denoiser, denoiser_sizes.withoutOverlapScratchSizeInBytes), THROW_IF_ERROR);
CUDA_API_CALL(cuStreamCreate(&cuda_stream, CU_STREAM_DEFAULT), THROW_IF_ERROR);
OPTIX_API_CALL(optixDenoiserSetup(
denoiser,
0, // cuda stream,
cuda_stream,
max_img_width,
max_img_height,
state_denoiser,
denoiser_sizes.stateSizeInBytes,
scratch_denoiser,
denoiser_sizes.withoutOverlapScratchSizeInBytes);
if(optix_result != OPTIX_SUCCESS) {
std::string message = "Error. Cannot create MasterOptixDenoiser due to optixDenoiserSetup failed "
"with error code: " + std::to_string(optix_result);
throw std::runtime_error(message);
}
denoiser_sizes.withoutOverlapScratchSizeInBytes), THROW_IF_ERROR);
allocateCudaBuffer(&cuda_pixels_buffer, getMaxBufferSize(), "input/noisy pixels buffer");
if(is_denoising_passes_on) {
allocateCudaBuffer(&cuda_albedo_buffer, getMaxBufferSize(), "input albedo buffer");
allocateCudaBuffer(&cuda_normal_buffer, getMaxBufferSize(), "input normal buffer");
}
allocateCudaBuffer(&cuda_denoised_pixels_buffer, getMaxBufferSize(), "denoised pixels buffer");
LOG(INFO) << "MasterOptixDenoiser creation is complete";
}
MasterOptixDenoiser::~MasterOptixDenoiser() {
LOG(INFO) << "Destroing MasterOptixDenoiser";
CUresult cu_result = cuMemFree(scratch_denoiser);
if(cu_result != CUDA_SUCCESS) {
LOG(ERROR) << "Error. cuMemFree failed for scratch_denoiser with error code: " << std::to_string(cu_result);
}
cu_result = cuMemFree(state_denoiser);
if(cu_result != CUDA_SUCCESS) {
LOG(ERROR) << "Error. cuMemFree failed for state_denoiser with error code: " << std::to_string(cu_result);
}
OptixResult optix_result = optixDenoiserDestroy(denoiser);
if(optix_result != OPTIX_SUCCESS) {
LOG(ERROR) << "Error. optixDenoiserDestroy failed with error code: " << optix_result;
}
const CUDAContextScope scope(cuda_context);
CUDA_API_CALL(cuMemFree(scratch_denoiser), DO_NOT_THROW);
CUDA_API_CALL(cuMemFree(state_denoiser), DO_NOT_THROW);
CUDA_API_CALL(cuStreamDestroy(cuda_stream), DO_NOT_THROW);
OPTIX_API_CALL(optixDenoiserDestroy(denoiser), DO_NOT_THROW);
releaseCudaBuffers();
cu_result = cuCtxDestroy(cuda_context);
if(cu_result != CUDA_SUCCESS) {
LOG(ERROR) << "Error. cuCtxDestroy failed with error code: " << std::to_string(cu_result);
}
LOG(INFO) << "Destroyed MasterOptixDenoiser";
}
void MasterOptixDenoiser::allocateCudaBuffer(CUdeviceptr * buffer_ptr, size_t buffer_size, const std::string & buffer_name) {
LOG(INFO) << "Allocating cuda memory for " << buffer_name;
CUresult cu_result = cuMemAlloc(buffer_ptr, buffer_size);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Could not allocate memory for " + buffer_name + " on device. Cuda error code: "
+ std::to_string(cu_result);
throw std::runtime_error(message);
}
CUDA_API_CALL(cuMemAlloc(buffer_ptr, buffer_size), THROW_IF_ERROR);
}
size_t MasterOptixDenoiser::getMaxBufferSize() {
@ -206,32 +108,17 @@ size_t MasterOptixDenoiser::getMaxBufferSize() {
}
void MasterOptixDenoiser::releaseCudaBuffers() {
CUresult cu_result = cuMemFree(cuda_pixels_buffer);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for cuda_pixels_buffer with error code:" + std::to_string(cu_result);
}
CUDA_API_CALL(cuMemFree(cuda_pixels_buffer), DO_NOT_THROW);
cuda_pixels_buffer = 0;
if(cuda_albedo_buffer) {
cu_result = cuMemFree(cuda_albedo_buffer);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for cuda_albedo_buffer with error code:" + std::to_string(cu_result);
}
CUDA_API_CALL(cuMemFree(cuda_albedo_buffer), DO_NOT_THROW);
cuda_albedo_buffer = 0;
}
if(cuda_normal_buffer) {
cu_result = cuMemFree(cuda_normal_buffer);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for cuda_normal_buffer with error code:" + std::to_string(cu_result);
}
CUDA_API_CALL(cuMemFree(cuda_normal_buffer), DO_NOT_THROW);
cuda_normal_buffer = 0;
}
cu_result = cuMemFree(cuda_denoised_pixels_buffer);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for cuda_denoised_pixels_buffer with error code:" + std::to_string(cu_result);
}
CUDA_API_CALL(cuMemFree(cuda_denoised_pixels_buffer), DO_NOT_THROW);
cuda_denoised_pixels_buffer = 0;
}
@ -252,11 +139,7 @@ bool MasterOptixDenoiser::initInputOptixImage(OptixImage2D &input_image, CUdevic
const int row_stride = server_image.getWidth() * pixel_stride;
const size_t buffer_size_bytes = sizeof(cgr_libcluster::float3) * server_image.getWidth() * server_image.getHeight();
CUresult cu_result = cuMemcpyHtoD(input_buffer_device, input_buffer_host, buffer_size_bytes);
if(cu_result != CUDA_SUCCESS) {
LOG(ERROR) << "Error. Could not copy input buffer from host to device memory. CUDA error code: " << cu_result;
return false;
}
CUDA_API_CALL(cuMemcpyHtoD(input_buffer_device, input_buffer_host, buffer_size_bytes), THROW_IF_ERROR);
input_image.data = input_buffer_device;
input_image.width = server_image.getWidth();
input_image.height = server_image.getHeight();
@ -267,6 +150,7 @@ bool MasterOptixDenoiser::initInputOptixImage(OptixImage2D &input_image, CUdevic
}
DenoisingResult MasterOptixDenoiser::denoise(ServerImage &server_image) {
const CUDAContextScope scope(cuda_context);
if(save_denoise_io) {
if (save_every_n_images <= 0) {
LOG(ERROR) << "Error: invalid save_every_n_images: " << save_every_n_images;
@ -314,7 +198,7 @@ DenoisingResult MasterOptixDenoiser::denoise(ServerImage &server_image) {
OptixResult optix_result = optixDenoiserInvoke(
denoiser,
0, // cuda stream
cuda_stream,
&params_denoiser,
state_denoiser,
denoiser_sizes.stateSizeInBytes,
@ -329,24 +213,28 @@ DenoisingResult MasterOptixDenoiser::denoise(ServerImage &server_image) {
LOG(ERROR) << "Error. Optix denoise failed. OPTIX error code: " << optix_result;
return DenoisingResult::FAILED;
}
const size_t buffer_size_bytes = sizeof(cgr_libcluster::float3) * server_image.getWidth() * server_image.getHeight();
CUresult cu_result = cuMemcpyDtoH(server_image.denoised_pixels, cuda_denoised_pixels_buffer, buffer_size_bytes);
if(cu_result != CUDA_SUCCESS) {
LOG(ERROR) << "Error. Could not copy image buffer from device to host memory. CUDA error code: " << cu_result;
return DenoisingResult::FAILED;
}
server_image.denoised = true;
// explicitly wait till denoising is complete
cuStreamSynchronize(cuda_stream);
denoised_buffer_size_bytes = sizeof(cgr_libcluster::float3) * server_image.getWidth() * server_image.getHeight();
LOG(INFO) << "denoising time for frame: " << server_image.camera.frame << " " << denoizer_timer.elapsed();
if(save_denoise_io) {
if (save_every_n_images <= 0) {
LOG(ERROR) << "Error: invalid save_every_n_images: " << save_every_n_images;
} else if (server_image.getFrame() % save_every_n_images == 0) {
saveDenoisedImage(server_image);
return DenoisingResult::OK;
}
CUdeviceptr MasterOptixDenoiser::getCudaMemPointerToDenoisedImage() const {
return cuda_denoised_pixels_buffer;
}
return server_image.denoised ? DenoisingResult::OK : DenoisingResult::FAILED;
CUstream MasterOptixDenoiser::getCudaStream() const {
return cuda_stream;
}
void MasterOptixDenoiser::copyDenoisedImageFromCudaToHostMemory(uint8_t * dest_host_memory) {
scoped_timer copy_device_2_host_timer;
const CUDAContextScope scope(cuda_context);
CUDA_API_CALL(cuMemcpyDtoHAsync(dest_host_memory, cuda_denoised_pixels_buffer,
denoised_buffer_size_bytes, cuda_stream), THROW_IF_ERROR);
CUDA_API_CALL(cuStreamSynchronize(cuda_stream), THROW_IF_ERROR);
LOG(INFO) << "time to copy denoised image from device to host: " << copy_device_2_host_timer.elapsed();
}
} // cgr_libcluster

View File

@ -8,6 +8,8 @@
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
#include <optix_stubs.h>
@ -32,21 +34,27 @@ public:
static const size_t DEFAULT_IMAGE_WIDTH_PIXELS = 1024;
static const size_t DEFAULT_IMAGE_HEIGHT_PIXELS = 1024;
MasterOptixDenoiser(bool save_denoise_io, int save_every_n_images, const std::string & output_folder_path,
bool is_denoising_passes_on, const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height);
MasterOptixDenoiser(CUcontext cuda_context, bool save_denoise_io, int save_every_n_images,
const std::string & output_folder_path, bool is_denoising_passes_on,
const ImageOutputProvider & image_output_provider, int max_img_width, int max_img_height);
virtual ~MasterOptixDenoiser();
virtual DenoisingResult denoise(ServerImage &server_image) override;
MasterOptixDenoiser(MasterOptixDenoiser const&) = delete;
void operator=(MasterOptixDenoiser const&) = delete;
CUstream getCudaStream() const;
CUdeviceptr getCudaMemPointerToDenoisedImage() const;
void copyDenoisedImageFromCudaToHostMemory(uint8_t * dest_host_memory);
private:
// For denoising we always use device #0;
// rendering can be assigned to devices starting from #1,
// so denoising and rendering do not run on the same device and do not compete for GPU resources.
static const int DEVICE_NUM = 0;
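// Example (assumed deployment, not enforced here): on a 4-GPU host the denoiser
// uses device 0 while rendering workers are assigned to devices 1-3.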
private:
void allocateCudaBuffer(CUdeviceptr * buffer_ptr, size_t buffer_size, const std::string & buffer_name);
void releaseCudaBuffers();
size_t getMaxBufferSize();
@ -54,6 +62,18 @@ private:
ServerImage &server_image, ImagePixel * input_buffer_host);
void initOutputOptixImage(OptixImage2D &output_image, ServerImage &server_image);
// We set the cuda stream explicitly so components which use the denoised image downstream,
// like pixel conversion and image compression, can run within the same cuda stream
// and therefore be implicitly synchronized for the best possible performance.
// For now we synchronize explicitly by calling cuStreamSynchronize() in denoise()
// so the denoise time logging outputs the correct denoising duration. Denoising time calculation
// and logging need to be updated upon switching to implicit synchronization by the CUDA runtime.
// Here is a task for that improvement: T143205954.
// Currently the cuda stream is owned by this class for simplicity. Alternatively the stream
// could be passed in from outside, but this requires more changes in ServerImageBuffer and DenoisingContext.
// Not making this change at this point to limit the scope to the nvdecoder integration;
// it can be done as part of the synchronization improvement task mentioned above.
CUstream cuda_stream = nullptr;
CUcontext cuda_context = nullptr;
OptixDenoiser denoiser = nullptr;
OptixDenoiserSizes denoiser_sizes;
@ -64,7 +84,11 @@ private:
CUdeviceptr cuda_pixels_buffer = 0;
CUdeviceptr cuda_albedo_buffer = 0;
CUdeviceptr cuda_normal_buffer = 0;
CUdevice m_cuda_device;
CUdeviceptr cuda_denoised_pixels_buffer = 0;
size_t denoised_buffer_size_bytes = 0;
int max_img_width = DEFAULT_IMAGE_WIDTH_PIXELS;
int max_img_height = DEFAULT_IMAGE_HEIGHT_PIXELS;
};

View File

@ -90,6 +90,7 @@ bool NetCamera::operator==(const NetCamera &other_camera) const
this->compression_quality == other_camera.compression_quality &&
this->master_denoiser == other_camera.master_denoiser &&
this->master_image_color_format == other_camera.master_image_color_format &&
this->master_image_compressor == other_camera.master_image_compressor &&
this->worker_map == other_camera.worker_map;
}

View File

@ -3,6 +3,7 @@ namespace cgr_libcluster;
enum CameraTypeFlatBuffer:byte { CAMERA_PERSPECTIVE = 0, CAMERA_ORTHOGRAPHIC, CAMERA_PANORAMA }
enum MasterDenoiserFlatBuffer:byte { MASTER_DENOISER_NONE = 0, MASTER_DENOISER_OIDN, MASTER_DENOISER_OPTIX, MASTER_DENOISER_BARCELONA }
enum MasterImageColorFormatFlatBuffer:byte { MASTER_IMAGE_COLOR_FORMAT_LINEAR = 1, MASTER_IMAGE_COLOR_FORMAT_SRGB = 2}
enum MasterImageCompressorFlatBuffer:byte { MASTER_IMAGE_COMPRESSOR_JPEG = 1, MASTER_IMAGE_COMPRESSOR_NVENCODER = 2}
table NetCameraFlatBuffer {
cam_matrix:TransformFlatBuffer;
@ -28,6 +29,7 @@ table NetCameraFlatBuffer {
worker_map:WorkerMapFlatBuffer;
scene_frame:int;
master_image_color_format:MasterImageColorFormatFlatBuffer = MASTER_IMAGE_COLOR_FORMAT_LINEAR;
master_image_compressor:MasterImageCompressorFlatBuffer = MASTER_IMAGE_COMPRESSOR_JPEG;
}
table TransformFlatBuffer {

View File

@ -90,6 +90,8 @@ public:
bool expect_modify_object_message = false;
ClusterSessionParams::MasterImageColorFormat master_image_color_format =
ClusterSessionParams::DEFAULT_MASTER_IMAGE_COLOR_FORMAT;
ClusterSessionParams::MasterImageCompressor master_image_compressor =
ClusterSessionParams::DEFAULT_MASTER_IMAGE_COMPRESSOR;
};

View File

@ -8,6 +8,16 @@
#include "denoising/denoising_context.h"
#include "streamed_image.h"
// Not sure that this is the best place to set OS_LINUX.
// IMO it should be set somewhere at a higher level;
// setting it here for now to get the system built.
// TODO: revisit this flag later
#if defined(linux) || defined(__linux) || defined(__linux__)
# ifndef OS_LINUX
# define OS_LINUX
# endif
#endif
namespace cgr_libcluster {
NetServer::~NetServer()
@ -54,8 +64,8 @@ void NetServer::stop()
netThread = new std::thread(std::bind(&NetServer::run, this));
}
}
tbb::mutex::scoped_lock imageStoplock(serverImage.serverImageStopMutex);
serverImage.reset();
tbb::mutex::scoped_lock imageStoplock(server_image_buffer.serverImageStopMutex);
server_image_buffer.reset();
}
void NetServer::createStreamedImage(unsigned int readIndex, ServerImage &currentImage, DenoisingContext & denoising_context)
@ -65,7 +75,7 @@ void NetServer::createStreamedImage(unsigned int readIndex, ServerImage &current
#ifndef WORKER_IMAGE_STREAMING_TEST
mergeWithWorkers = theClient->mergeWith(currentImage);
#endif
serverImage.createStreamedImage(readIndex, mergeWithWorkers, denoising_context);
server_image_buffer.createStreamedImage(readIndex, mergeWithWorkers, denoising_context);
}
}
@ -87,6 +97,7 @@ void NetServer::set_save_every_n_images(int save_every_n_images) {
void NetServer::set_output_folder_path(const std::string & output_folder_path) {
this->output_folder_path = output_folder_path;
server_image_buffer.set_output_folder_path(output_folder_path);
}
void NetServer::run()
@ -100,13 +111,15 @@ void NetServer::run()
for (;;) {
if (stopped) break;
tbb::mutex::scoped_lock imageStopLock(serverImage.serverImageStopMutex);
if (!serverImage.isEmpty()) {
int readIndex = serverImage.readIndex;
LOG(INFO) << "server image indices: " << serverImage.writeIndex << " " << serverImage.readIndex;
ServerImage& currentImage = serverImage.readImage();
tbb::mutex::scoped_lock imageStopLock(server_image_buffer.serverImageStopMutex);
if (!server_image_buffer.isEmpty()) {
int readIndex = server_image_buffer.readIndex;
LOG(INFO) << "server image indices: " << server_image_buffer.writeIndex << " " << server_image_buffer.readIndex;
ServerImage& currentImage = server_image_buffer.readImage();
//if has client, combine with correct client image before sending
if (isMaster()) createStreamedImage(readIndex, currentImage, denoising_context);
if (isMaster()) {
createStreamedImage(readIndex, currentImage, denoising_context);
}
RPCSend streamSnd(image_socket, &error_func, SERVER_STREAM_IMAGE_CMD);
if (error_func.have_error()) {
@ -114,7 +127,7 @@ void NetServer::run()
}
if (isMaster()) {
// Create a StreamedImage
StreamedImage &streamedImage = serverImage.streamedImageBuffer[readIndex];
StreamedImage &streamedImage = server_image_buffer.streamedImageBuffer[readIndex];
send_streamed_image(streamSnd, streamedImage);
if (save_streamed_image) {
if (save_every_n_images <= 0) {
@ -135,12 +148,16 @@ void NetServer::run()
time_sleep(NET_IMAGE_PAUSE);
}
} //endfor
#if defined(OS_LINUX) && defined(WITH_CUDA)
server_image_buffer.deleteNvEncoder();
#endif
VLOG(3) << "Exit NetServer run loop";
}
NetServer::NetServer(
const char *masterAddr, unsigned short masterPort)
: NetBase(SERVER_PORT, masterAddr),
theClient(NULL), serverImage(IMAGE_FRAME_COUNT, true, true),
theClient(NULL), server_image_buffer(IMAGE_FRAME_COUNT, true, true),
masterPort(masterPort), net_camera_command(net_camera, modify_object_message)
{
// acceptConnection();
@ -159,7 +176,7 @@ ClusterRenderCommand& NetServer::wait_for_client_command() {
rcv.read_buffer(&cam, sizeof(cam));
LOG(INFO) << "server receive camera for frame: " << cam.frame << " cam_width: " << cam.cam_width << " cam_height: " << cam.cam_height ;
numCameraReceived++;
serverImage.set_master_image_color_format(cam.master_image_color_format);
server_image_buffer.set_master_image_color_format(cam.master_image_color_format);
if(cam.expect_modify_object_message) {
rcv.read_buffer(&modify_object_message, sizeof(modify_object_message));
if (theClient) {
@ -180,7 +197,7 @@ ClusterRenderCommand& NetServer::wait_for_client_command() {
childCam.sampleCount = cam.sampleCount * theClient->getChildrenCount();
theClient->send_camera(childCam);
}
serverImage.begin_frame(cam);
server_image_buffer.begin_frame(cam);
return net_camera_command;
} else if (rcv.name == KILL_CMD) {
LOG(INFO) << "received terminate message...exiting";
@ -200,7 +217,7 @@ bool NetServer::send_tile(const NetRenderTile& rtile)
//we have to insert NetCamera associated with image
//before first tile insertion
ServerImage* activeImage = serverImage.getActiveImage();
ServerImage* activeImage = server_image_buffer.getActiveImage();
assert(activeImage);
if (activeImage) {
//no locking because tile insertion are for different elements in the buffer
@ -208,7 +225,7 @@ bool NetServer::send_tile(const NetRenderTile& rtile)
if (activeImage->tile_count == getMaxTileCount()) {
activeImage->imageCount = 1;
activeImage->sampleCount = activeImage->camera.sampleCount;
serverImage.endInsertImage();
server_image_buffer.endInsertImage();
}
std::cout << "Insert tile progress: "<< activeImage->tile_count << " "<< getMaxTileCount() << "\n";
return true;

View File

@ -22,7 +22,7 @@ class NetServer: public NetBase {
NetClient *theClient; //client of worker servers
void createStreamedImage(unsigned int index, ServerImage& curImage, DenoisingContext & denoising_context);
ServerImageBuffer serverImage;
ServerImageBuffer server_image_buffer;
unsigned short masterPort;

View File

@ -10,8 +10,8 @@ namespace cgr_libcluster {
//using namespace ClusterRenderer;
class NetCamera;
class Float4FlatBuffer;
class TransformFlatBuffer;
struct Float4FlatBuffer;
struct TransformFlatBuffer;
class Serializer {
public:
@ -23,14 +23,12 @@ public:
size_t size;
};
// Pass in FlatBuffer otherwise the memory won't exist when the method is exited
// Pass in FlatBufferBuilder otherwise the memory won't exist when the method is exited
static Buffer serialize(flatbuffers::FlatBufferBuilder &builder, const NetCamera & net_camera);
static void deserialize(uint8_t *buffer_pointer, NetCamera &net_camera_out);
private:
static void bufferToFloat4(const Float4FlatBuffer * float4_flatbuffer, cgr_libcluster::float4 & float4_out);
static void bufferToTransform(const TransformFlatBuffer * cam_matrix_flatbuffer, cgr_libcluster::Transform & cam_matrix_out);
};

View File

@ -125,6 +125,8 @@ size_t ServerImage::read(RPCReceive &rcv)
NetCamera net_camera;
rcv.read_buffer(&net_camera, sizeof(NetCamera));
size_t buf_size = net_camera.cam_width * net_camera.cam_height * sizeof (ImagePixel);
// log line below is used by get_metrics.py to calculate stats. If you change it
// please make sure get_metrics.py still works correctly. Update if needed.
LOG(INFO) << "net camera received for frame: " << net_camera.frame << " " << net_camera.cam_width << " " << net_camera.cam_height;
rcv.read_buffer(pixels, buf_size);
LOG(INFO) << "read image for frame: " << net_camera.frame << " with sample: " << net_camera.sampleCount << " of size: " << buf_size;

View File

@ -1,22 +1,56 @@
#include "net_base.h" // for NET_IMAGE_TIMEOUT
#include <fstream> // to save video stream
#include <tbb/parallel_for.h>
#ifdef WITH_CUDA
#ifdef WITH_CUDA_DYNLOAD
#include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
// The macro below is used by Optix SDK and is necessary to avoid DSO loading collision
// See device_optix.cpp for example.
#define OPTIX_DONT_INCLUDE_CUDA
#else
#include <cuda.h>
#endif
#include "compression/nv_decoder.h"
#include "compression/nv_encoder.h"
#endif
#include "compression/turbojpeg_compressor.h"
#ifdef WITH_CUDA
#include "cuda_context_provider.h"
#endif
#include "./utils/timer.h" // for time_dt
#include "denoising/denoising_context.h"
#include "net_base.h" // for NET_IMAGE_TIMEOUT
#ifdef WITH_OPTIX
#include "denoising/master_optix_denoiser.h"
#endif
#include "server_image_buffer.h"
#include "server_image.h"
#ifdef WITH_CUDA
#include "../libcluster_cuda_kernels/gpu_image_utils.h"
#endif
namespace cgr_libcluster {
const string ServerImageBuffer::VIDEO_FULL_FILE_NAME = "master_nvencoded_video_stream.h264";
ServerImageBuffer::ServerImageBuffer(int s, bool hasServerImage, bool hasStreamImage): imageBuffer(NULL), buffer_size(s)
{
if (hasServerImage) imageBuffer = new ServerImage[buffer_size];
if (hasStreamImage) streamedImageBuffer.resize(buffer_size);
turbojpeg_compressor_uptr.reset(new TurbojpegCompressor());
reset();
}
ServerImageBuffer::~ServerImageBuffer()
{
if (imageBuffer) delete []imageBuffer;
#if defined(OS_LINUX) && defined(WITH_CUDA)
releaseImageCompressionCudaBuffers();
#endif
}
ServerImage* ServerImageBuffer::getActiveImage() {
@ -29,6 +63,12 @@ ServerImage& ServerImageBuffer::getReadImage() {
return imageBuffer[readIndex];
}
void ServerImageBuffer::set_output_folder_path(const std::string & output_folder_path) {
this->output_folder_path = output_folder_path;
encoded_videostream_filename = output_folder_path + "/" + VIDEO_FULL_FILE_NAME;
VLOG(3) << "Path to the nvencoded video stream file: " << encoded_videostream_filename;
}
void ServerImageBuffer::reset()
{
readIndex = 0;
@ -95,6 +135,7 @@ void ServerImageBuffer::endInsertImage()
//but I am leaving it fixed for now
void ServerImageBuffer::parallel_convert(const ServerImage& src, std::vector<cgr_libcluster::uchar3>& dst)
{
scoped_timer pixel_convert_timer;
int ts = src.getWidth() * src.getHeight();
int thread_count = PIXEL_THREAD_COUNT;
int interval = ts/thread_count;
@ -118,6 +159,7 @@ void ServerImageBuffer::parallel_convert(const ServerImage& src, std::vector<cgr
VLOG(1) << "ERROR. Unknown master image color format. We shuld not get here. Requested color format value: "
<< master_image_color_format;
}
LOG(INFO) << "pixel conversion time for frame: " << src.camera.frame << " " << pixel_convert_timer.elapsed();
}
void ServerImageBuffer::normalizeImage(ServerImage &server_image) {
@ -134,22 +176,220 @@ void ServerImageBuffer::normalizeImage(ServerImage &server_image) {
}
}
void ServerImageBuffer::createStreamedImage(int index, bool scaled, DenoisingContext & denoising_context)
{
ServerImage &bufferImage = imageBuffer[index];
StreamedImage &streamedImage = streamedImageBuffer[index];
streamedImage.initImage(bufferImage.camera);
bufferImage.denoised = false;
if(!scaled) {
normalizeImage(bufferImage);
void ServerImageBuffer::writeImageToVideoStreamFile(
const std::vector<uint8_t> &encoded_image) const {
if(encoded_videostream_filename.length() < 1) {
return;
}
std::ofstream video_stream_file;
video_stream_file.open(encoded_videostream_filename, std::ios::app | std::ios::binary);
if(video_stream_file.is_open()) {
video_stream_file.write(reinterpret_cast<const char*>(encoded_image.data()), encoded_image.size());
VLOG(3) << "Wrote encoded image of size: " << encoded_image.size();
video_stream_file.close();
} else {
std::string message = "FATAL. Unable to open video stream output file: " + encoded_videostream_filename;
throw std::invalid_argument(message);
}
}
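// Note: each encoded frame is appended as-is, so the file accumulates a raw H.264
// elementary stream (assuming NvEncoder emits Annex-B access units, the NVENC default);
// such a stream can typically be inspected or played back directly by common players.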
if(bufferImage.camera.master_denoiser) {
denoiseImage(bufferImage, denoising_context);
#if defined(OS_LINUX) && defined(WITH_CUDA)
void ServerImageBuffer::allocateCudaBuffer(CUdeviceptr * buffer_ptr, size_t buffer_size, const std::string & buffer_name) {
LOG(INFO) << "Allocating cuda memory for: " << buffer_name;
CUresult cu_result = cuMemAlloc(buffer_ptr, buffer_size);
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Could not allocate memory for " + buffer_name + " on device. Cuda error code: "
+ std::to_string(cu_result);
throw std::runtime_error(message);
}
}
void ServerImageBuffer::resetNvEncoder(const ServerImage &server_image) {
VLOG(3) << "Resetting NvEncoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
// The renderer, optix denoiser and nvencoder share the same cuda context; call cuCtxSynchronize
// to make sure all operations on the cuda context are completed before we start creating the NvEncoder.
// Not doing this may lead to sporadic problems like the nvEncDestroyEncoder call hanging.
cuCtxSynchronize();
nv_encoder_uptr.reset(new NvEncoder(
NV_ENC_BUFFER_FORMAT_NV12,
cuda_context,
server_image.getWidth(),
server_image.getHeight()));
VLOG(3) << "Created NvEncoder successfully";
releaseImageCompressionCudaBuffers();
allocateImageCompressionCudaBuffers(server_image);
VLOG(3) << "Resetting NvEncoder done";
}
void ServerImageBuffer::copyServerImageToGpuMemory(const ServerImage & server_image, CUdeviceptr & linear_image_buffer_gpu) {
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
uint8_t* linear_image_buffer_cpu = server_image.denoised ? (uint8_t*)server_image.denoised_pixels :
(uint8_t*)server_image.pixels;
CUresult cu_result = cuMemcpyHtoD(linear_image_buffer_gpu, linear_image_buffer_cpu, server_image.getBufferSize());
if(cu_result != CUDA_SUCCESS) {
std::string message = "Error. Could not copy input buffer from host to device memory. CUDA error code: "
+ std::to_string(cu_result);
throw std::runtime_error(message);
}
}
void ServerImageBuffer::allocateImageCompressionCudaBuffers(const ServerImage &server_image) {
const size_t num_pixels = server_image.getImageSize();
// NV12: a full-resolution Y plane (1 byte per pixel) followed by an interleaved
// UV plane at quarter resolution (num_pixels / 2 bytes)
const int nvencoder_yuv_nv12_buffer_size = num_pixels + (num_pixels)/2;
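// Worked example (hypothetical size): a 1920x1080 image has 2,073,600 pixels, so the
// NV12 buffer is 2,073,600 (Y) + 1,036,800 (UV) = 3,110,400 bytes, i.e. 1.5 bytes per pixel.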
allocateCudaBuffer(&nvencoder_input_buffer_gpu, nvencoder_yuv_nv12_buffer_size,
"input cuda buffer for nvencoder");
allocateCudaBuffer(&linear_image_buffer_gpu, server_image.getBufferSize(),
"cuda buffer to accept linear images from CPU memory");
}
void ServerImageBuffer::releaseImageCompressionCudaBuffers() {
CUresult cu_result = cuMemFree(nvencoder_input_buffer_gpu);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for nvencoder_input_buffer_gpu with error code:" + std::to_string(cu_result);
}
nvencoder_input_buffer_gpu = 0;
cu_result = cuMemFree(linear_image_buffer_gpu);
if(cu_result != CUDA_SUCCESS) {
"Error. cuMemFree failed for linear_image_buffer_gpu with error code:" + std::to_string(cu_result);
}
linear_image_buffer_gpu = 0;
}
void ServerImageBuffer::deleteNvEncoder() {
nv_encoder_uptr.reset(nullptr);
}
#endif // WITH_CUDA
void ServerImageBuffer::createStreamedImage(int index, bool scaled, DenoisingContext & denoising_context)
{
ServerImage &server_image = imageBuffer[index];
StreamedImage &streamed_image = streamedImageBuffer[index];
streamed_image.initImage(server_image.camera);
server_image.denoised = false;
if(!scaled) {
normalizeImage(server_image);
}
float sampleCount = server_image.sampleCount;
#if defined(OS_LINUX) && defined(WITH_CUDA)
const bool use_nvencoder_for_compression = server_image.camera.master_image_compressor ==
ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER;
if(use_nvencoder_for_compression) {
if(!nv_encoder_uptr || server_image.camera.frame == 0) {
resetNvEncoder(server_image);
}
}
#else
if(server_image.camera.master_image_compressor == ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
throw std::runtime_error("ERROR. NVENCODER compressor is requested. Server is compiled without CUDA support\
so has no nvencoder and can not encode images with nvencoder.\
Recompile with CUDA or use JPEG compressor instead. Terminating.");
}
#endif
if(server_image.camera.master_denoiser) {
bool denoising_ok = denoiseImage(server_image, denoising_context);
if(!denoising_ok) {
throw std::runtime_error("Image denoising failed.");
}
}
MasterDenoiser * master_denoiser = denoising_context.getDenoiser(server_image.camera.master_denoiser);
const bool image_is_denoised_by_optix_denoiser = master_denoiser &&
master_denoiser->getType() == ClusterSessionParams::MasterDenoiser::MASTER_DENOISER_OPTIX;
#ifdef WITH_OPTIX
MasterOptixDenoiser* optix_denoiser = nullptr;
if(image_is_denoised_by_optix_denoiser) {
optix_denoiser = dynamic_cast<MasterOptixDenoiser*>(master_denoiser);
}
#endif
scoped_timer compression_timer;
#if defined(OS_LINUX) && defined(WITH_CUDA)
if(use_nvencoder_for_compression) {
#ifdef WITH_OPTIX
compressWithNvencoder(server_image, streamed_image, optix_denoiser);
#else
compressWithNvencoder(server_image, streamed_image);
#endif
} else {
#ifdef WITH_OPTIX
compressWithTurbojpeg(server_image, streamed_image, optix_denoiser);
#else
compressWithTurbojpeg(server_image, streamed_image);
#endif
}
#else
#ifdef WITH_OPTIX
compressWithTurbojpeg(server_image, streamed_image, optix_denoiser);
#else
compressWithTurbojpeg(server_image, streamed_image);
#endif
#endif
double compression_time = compression_timer.elapsed();
LOG(INFO) << "compression time for frame: " << server_image.camera.frame << " " << compression_time <<
" (includes pixel conversion time)";
}
#if defined(OS_LINUX) && defined(WITH_CUDA)
#ifdef WITH_OPTIX
void ServerImageBuffer::compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image,
MasterOptixDenoiser* optix_denoiser) {
#else
void ServerImageBuffer::compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image) {
#endif
CUstream cuda_stream = nullptr;
CUdeviceptr image_to_compress_gpu_ptr = 0;
#ifdef WITH_OPTIX
if(optix_denoiser) {
cuda_stream = optix_denoiser->getCudaStream();
image_to_compress_gpu_ptr = optix_denoiser->getCudaMemPointerToDenoisedImage();
} else
#endif
{
copyServerImageToGpuMemory(server_image, linear_image_buffer_gpu);
image_to_compress_gpu_ptr = linear_image_buffer_gpu;
}
const bool useSrgbColorSpace =
master_image_color_format == ClusterSessionParams::MasterImageColorFormat::MASTER_IMAGE_COLOR_FORMAT_SRGB;
gpuRawRgbToGammaCorrectedYuvNv12(image_to_compress_gpu_ptr, server_image.getWidth(),
server_image.getHeight(), useSrgbColorSpace, cuda_stream, nvencoder_input_buffer_gpu);
nv_encoder_uptr->encode(nvencoder_input_buffer_gpu, streamed_image.getCompressedImage());
}
#endif // WITH_CUDA
#ifdef WITH_OPTIX
void ServerImageBuffer::compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image,
MasterOptixDenoiser* optix_denoiser) {
#else
void ServerImageBuffer::compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image) {
#endif
#ifdef WITH_OPTIX
if(optix_denoiser) {
optix_denoiser->copyDenoisedImageFromCudaToHostMemory(
(uint8_t*)server_image.denoised_pixels);
server_image.denoised = true;
}
#endif
parallel_convert(server_image, streamed_image.getByteBuffer());
uint8_t *jpeg_data = NULL;
size_t buf_size = turbojpeg_compressor_uptr->compress(streamed_image.getByteBuffer().data(),
server_image.getWidth(), server_image.getHeight(),
server_image.camera.compression_quality,
jpeg_data);
if (jpeg_data) {
streamed_image.copyInCompressedImage(jpeg_data, buf_size);
turbojpeg_compressor_uptr->free(jpeg_data);
}
scoped_timer pixel_convert_timer;
parallel_convert(bufferImage, streamedImage.getByteBuffer());
LOG(INFO) << "pixel conversion time for frame: " << bufferImage.camera.frame << " " << pixel_convert_timer.elapsed();
}
bool ServerImageBuffer::denoiseImage(ServerImage &bufferImage, DenoisingContext & denoising_context) {
@ -162,7 +402,22 @@ bool ServerImageBuffer::denoiseImage(ServerImage &bufferImage, DenoisingContext
if(result == DenoisingResult::OK) {
return true;
}
#ifdef WITH_OPTIX
else if (result == DenoisingResult::IMAGE_TOO_BIG &&
master_denoiser->getType() == ClusterSessionParams::MasterDenoiser::MASTER_DENOISER_OPTIX) {
bool replacement_ok = denoising_context.replaceOptixDenoiser(bufferImage.getWidth(), bufferImage.getHeight());
if(replacement_ok) {
master_denoiser = denoising_context.getDenoiser(bufferImage.camera.master_denoiser);
if(master_denoiser) {
return master_denoiser->denoise(bufferImage) == DenoisingResult::OK;
} else {
LOG(WARNING) << "WARNING. No denoiser instance of requested type: " << bufferImage.camera.master_denoiser;
}
} else {
LOG(ERROR) << "ERROR. replaceOptixDenoiser failed, image will not be denoised";
}
}
#endif
return false;
}
@ -209,19 +464,43 @@ int ServerImageBuffer::add_streamed_image(RPCReceive &rcv,
}
if (!isFull() ) {
int frameIndex = writeIndex;
StreamedImage &writeImage = streamedImageBuffer[frameIndex];
writeImage.read(rcv);
StreamedImage &streamed_image = streamedImageBuffer[frameIndex];
streamed_image.read(rcv);
scoped_timer decompress_timer;
#if defined(OS_LINUX) && defined(WITH_CUDA)
const bool use_nvdecoder_for_decompression = streamed_image.getNetCamera().master_image_compressor ==
ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER;
if(use_nvdecoder_for_decompression) {
if(!nv_decoder_uptr || streamed_image.getFrame() == 0) {
VLOG(3) << "Creating nvdecoder";
CUcontext cuda_context = CudaContextProvider::getPrimaryContext(CUDA_DEVICE_NUM);
const CUDAContextScope scope(cuda_context);
nv_decoder_uptr.reset(new NvDecoder(cuda_context));
}
nv_decoder_uptr->decode(streamed_image.getCompressedImage(),
streamed_image.getFrame(), &streamed_image.getRgbImageBuffer());
} else
#endif
{
int width = 0, height = 0;
streamed_image.getImage(width, height);
if (!turbojpeg_compressor_uptr->decompress(streamed_image.getCompressedImage(), width, height,
streamed_image.getRgbImageBuffer())) {
LOG(ERROR) << "jpeg decompression failed";
}
}
LOG(INFO) << "decompress image for frame: " << streamed_image.getFrame() << " " << decompress_timer.elapsed();
if (save_streamed_image) {
if (!writeImage.saveImage(output_folder_path, "client_streamed_image_rgb_", frame_id)) {
if (!streamed_image.saveImage(output_folder_path, "client_streamed_image_rgb_", frame_id)) {
LOG(ERROR) << "failed to save streamed image";
}
}
incWriteIndex();
LOG(INFO) << "thin client insert master image frame: "<< writeImage.getFrame();
LOG(INFO) << "thin client insert master image frame: "<< streamed_image.getFrame();
return frameIndex;
} else {
StreamedImage writeImage;
writeImage.read(rcv);
StreamedImage streamed_image;
streamed_image.read(rcv);
return -1;
}
}

View File

@ -11,16 +11,33 @@
#define IMAGE_FRAME_COUNT 10
#define PIXEL_THREAD_COUNT 60
// Not sure that this is the best place to set OS_LINUX.
// IMO it should be set somewhere at a higher level;
// setting it here for now to get the system built.
// TODO: revisit this flag later
#if defined(linux) || defined(__linux) || defined(__linux__)
# ifndef OS_LINUX
# define OS_LINUX
# endif
#endif
namespace cgr_libcluster {
using std::string;
class Camera;
class MasterOptixDenoiser;
class NetCamera;
#if defined(OS_LINUX) && defined(WITH_CUDA)
class NvEncoder;
class NvDecoder;
#endif
class RPCSend;
class RPCReceive;
class PathTraceDisplay;
class ServerImage;
class DenoisingContext;
class TurbojpegCompressor;
class ServerImageBuffer {
public:
@ -32,20 +49,16 @@ public:
bool isEmpty();
bool isFull();
ServerImage& getReadImage();
ServerImage* getActiveImage();
ServerImage* beginInsertImage(NetCamera&);
void endInsertImage();
bool begin_frame(NetCamera&);
int add_frame(RPCReceive& rcv, std::atomic<bool>& stopped);
int add_streamed_image(RPCReceive& rcv, std::atomic<bool>& stopped,
bool save_streamed_image, const std::string &output_folder_path, int frame_id);
bool wait_for_streamed_image(std::atomic<bool>& stopped );
const StreamedImage* get_streamed_image();
void set_output_folder_path(const std::string & output_folder_path);
void reset();
@ -57,6 +70,9 @@ public:
void set_master_image_color_format(ClusterSessionParams::MasterImageColorFormat master_image_color_format_in) {
master_image_color_format = master_image_color_format_in;
}
#if defined(OS_LINUX) && defined(WITH_CUDA)
void deleteNvEncoder();
#endif
//only accessed by insertion thread
@ -72,6 +88,9 @@ public:
tbb::mutex serverImageStopMutex;
private:
static const int CUDA_DEVICE_NUM = 0;
static const string VIDEO_FULL_FILE_NAME;
bool denoiseImage(ServerImage &bufferImage, DenoisingContext & denoising_context);
// In the master-workers configuration (which is the target production setup) images are normalised
// upon arriving at the master. It's done in different threads, which benefits from the
@ -82,6 +101,55 @@ private:
void normalizeImage(ServerImage &server_image);
ClusterSessionParams::MasterImageColorFormat master_image_color_format =
ClusterSessionParams::DEFAULT_MASTER_IMAGE_COLOR_FORMAT;
#ifdef WITH_CUDA
void allocateCudaBuffer(long long unsigned int * buffer_ptr, size_t buffer_size,
const std::string & buffer_name);
void allocateImageCompressionCudaBuffers(const ServerImage &server_image);
void releaseImageCompressionCudaBuffers();
void resetNvEncoder(const ServerImage &server_image);
void copyServerImageToGpuMemory(const ServerImage &server_image, long long unsigned int &linear_image_buffer_gpu);
#endif // WITH_CUDA
void writeImageToVideoStreamFile(const std::vector<uint8_t> &encoded_image) const;
#ifdef WITH_CUDA
#ifdef WITH_OPTIX
void compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image, MasterOptixDenoiser* optix_denoiser);
void compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image, MasterOptixDenoiser* optix_denoiser);
#else
void compressWithNvencoder(ServerImage &server_image, StreamedImage &streamed_image);
void compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image);
#endif // WITH_OPTIX
#else
void compressWithTurbojpeg(ServerImage &server_image, StreamedImage &streamed_image);
#endif // WITH_CUDA
std::string output_folder_path;
std::string encoded_videostream_filename;
bool image_buffers_allocated = false;
// Use long long unsigned int instead of CUdeviceptr since cuda headers are not included
// in this header file: server_image_buffer.h is included from cycles/session/buffers.cpp
// via libcluster/net_server.h, but cycles/session is unaware of cuda so compilation fails.
// To minimize changes to cycles/session and keep it cuda independent, cuda headers are
// included in server_image_buffer.cpp.
// We may come up with better encapsulation and a libcluster interface towards the Blender code.
long long unsigned int nvencoder_input_buffer_gpu = 0;
// When the rendered image is denoised by the OIDN denoiser, or is not denoised at all,
// it is hosted in CPU memory.
// In those cases we copy the image into this CUDA memory when nvencoder is used for image compression,
// so we can do gamma correction and rgb-to-yuv conversion before passing the image to the nvencoder.
long long unsigned int linear_image_buffer_gpu = 0;
// image compressors
#if defined(OS_LINUX) && defined(WITH_CUDA)
std::unique_ptr<NvEncoder> nv_encoder_uptr;
std::unique_ptr<NvDecoder> nv_decoder_uptr;
#endif
std::unique_ptr<TurbojpegCompressor> turbojpeg_compressor_uptr;
};
} // cgr_libcluster

View File

@ -1,15 +1,41 @@
#include "image_io_util.h"
#include <turbojpeg.h> // for tjFree
#include "./utils/timer.h" // for scoped_timer
#include "./utils/logging.h"
#include "streamed_image.h"
#include "net_simple.h"
#include "compression/turbojpeg_compressor.h"
namespace cgr_libcluster {
using OpenImageIO_v2_4::ImageOutput;
using OpenImageIO_v2_4::TypeDesc;
StreamedImage::StreamedImage() : allocated(false), w(0), h(0),
jpeg_compressor_uptr(new TurbojpegCompressor()) {
}
StreamedImage::~StreamedImage(){
}
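// Caution: the copy/move operations below only create a fresh TurbojpegCompressor;
// the image members (w, h, byte_buffer, compressed_image, camera) are left
// default-initialized rather than transferred. This appears intentional since the
// buffers are refilled per frame, but copies must not be relied on to preserve pixel data.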
StreamedImage::StreamedImage(const StreamedImage& ) :
jpeg_compressor_uptr(new TurbojpegCompressor()) {
}
StreamedImage::StreamedImage( StreamedImage&& ) :
jpeg_compressor_uptr(new TurbojpegCompressor()) {
}
StreamedImage& StreamedImage::operator=(const StreamedImage& ) {
jpeg_compressor_uptr.reset(new TurbojpegCompressor());
return *this;
}
StreamedImage& StreamedImage::operator=(StreamedImage&& ) {
jpeg_compressor_uptr.reset(new TurbojpegCompressor());
return *this;
}
size_t StreamedImage::read(RPCReceive &rcv)
{
size_t buf_size = 0;
@ -18,163 +44,47 @@ size_t StreamedImage::read(RPCReceive &rcv)
NetCamera net_camera;
rcv.read_buffer(&net_camera, sizeof(NetCamera));
// log line below is used by get_metrics.py to calculate stats. If you change it
// please make sure get_metrics.py still works correctly. Update if needed.
LOG(INFO) << "net camera received for frame: " << net_camera.frame << " " << net_camera.cam_width << " " << net_camera.cam_height;
initImage(net_camera);
std::vector<uint8_t> cbuffer(buf_size);
rcv.read_buffer(reinterpret_cast<char *>(cbuffer.data()), buf_size);
LOG(INFO) << "read image for frame: " << camera.frame << " with sample: " << camera.sampleCount << " size: " << buf_size;
scoped_timer dec_timer;
if (!decompress(cbuffer)) {
buf_size = 0;
if(compressed_image.size() != buf_size) {
compressed_image.resize(buf_size);
}
LOG(INFO) << "decompress image for frame: " << camera.frame << " " << dec_timer.elapsed();
rcv.read_buffer(reinterpret_cast<char *>(compressed_image.data()), buf_size);
LOG(INFO) << "read image for frame: " << camera.frame << " with sample: " << camera.sampleCount << " size: " << buf_size;
return buf_size;
}
size_t StreamedImage::write(RPCSend &snd)
{
scoped_timer compression_timer;
uint8_t *jpeg_data = NULL;
size_t buf_size = compress(jpeg_data);
LOG(INFO) << "compression time for frame: " << camera.frame << " " << compression_timer.elapsed();
if (buf_size > 0) {
// get_metrics.py script depends on formatting of this log line, it you change it,
const size_t compressed_image_size = compressed_image.size();
if (compressed_image_size > 0) {
// get_metrics.py script depends on formatting of this log line, if you change it,
// please make sure that script still works or update it accordingly
LOG(INFO) << "stream image for frame: " << camera.frame << " with sample " << camera.sampleCount << " quality: " << camera.compression_quality << " buffer size " << buf_size + sizeof(buf_size) + sizeof(NetCamera);
LOG(INFO) << "stream image for frame: " << camera.frame << " with sample " << camera.sampleCount << " quality: " <<
camera.compression_quality << " buffer size " << compressed_image_size + sizeof(compressed_image_size) + sizeof(NetCamera);
snd.write();
snd.write_buffer(&buf_size, sizeof(buf_size));
snd.write_buffer(&compressed_image_size, sizeof(compressed_image_size));
snd.write_buffer(&camera, sizeof (NetCamera) );
snd.write_buffer(jpeg_data, buf_size);
snd.write_buffer(compressed_image.data(), compressed_image_size);
}
if (jpeg_data) tjFree(jpeg_data);
return buf_size;
return compressed_image_size;
}
#ifdef WITH_WEBRTC
size_t StreamedImage::write(cgr_streaming::WebrtcPeer *webrtcPeer)
{
if (webrtcPeer == nullptr) {
LOG(ERROR) << "webrtcPeer is nullptr";
return 0;
}
size_t buf_size = getBufferSize();
if (buf_size > 0) {
// TODO(fangy): Might need to work with pmishchuk@ to update the LOG line below so get_metrics.py works when WebRTC is used.
// get_metrics.py script depends on formatting of this log line, if you change it,
// please make sure that script still works or update it accordingly
LOG(INFO) << "stream image for frame: " << camera.frame << " with sample " << camera.sampleCount << " quality: " << camera.compression_quality << " buffer size " << buf_size + sizeof(buf_size) + sizeof(NetCamera) << " using WebRTC";
int width = 0, height = 0;
void *image_buffer = getImage(width, height);
LOG(INFO) << "WebRTC sendFrame id " << getFrame() << " width " << width << " height " << height;
scoped_timer send_frame_timer;
cgr_streaming::WebrtcFrame frame(getFrame(), static_cast<const uint8_t*>(image_buffer), width, height);
if (!webrtcPeer->sendFrame(frame)) {
LOG(ERROR) << "failed to send frame via WebRTC";
}
LOG(INFO) << "send frame time for frame number: " << camera.frame << " " << send_frame_timer.elapsed();
}
return buf_size;
}
#endif
//#define TIME_JPEG
size_t StreamedImage::compress(unsigned char* &jpeg_image)
{
// Convert buffer to unsigned char * 3 channels
const int subsampling = TJSAMP_444;
size_t jpeg_length = 0; // tjCompress2 will allocate the jpeg_image buffer
jpeg_image = nullptr;
#ifdef TIME_JPEG
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
#endif
tjhandle jpeg_compressor = tjInitCompress();
if (jpeg_compressor == nullptr) {
LOG(ERROR) << "Cannot initialize JPEG compressor";
return 0;
}
void* src_buffer = byte_buffer.data();
int jpeg_error = tjCompress2(jpeg_compressor,
(unsigned char*) src_buffer,
w,
0,
h,
TJPF_RGB,
&jpeg_image,
(unsigned long *)&jpeg_length,
subsampling,
camera.compression_quality,
TJFLAG_FASTDCT);
tjDestroy(jpeg_compressor);
if (jpeg_error < 0) {
const char *jpeg_error_str = tjGetErrorStr();
LOG(ERROR) << "JPEG compression error: " << jpeg_error_str;
return 0;
void StreamedImage::copyInCompressedImage(const uint8_t * compressed_image_ptr, const size_t size_in_bytes) {
// for uint8_t the number_of_elements is the same as size_in_bytes
const size_t number_of_elements = size_in_bytes;
compressed_image.assign(compressed_image_ptr, compressed_image_ptr + number_of_elements);
}
#ifdef TIME_JPEG
clock_gettime(CLOCK_MONOTONIC, &end_time);
// ms time
double elapsed_time = (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
LOG(INFO) << "TIMING: JPEG compression: " << elapsed_time << "ms"
<< ", resolution " << w << "x" << h
<< ", sizes " << src_buffer.size() << " (" << jpeg_length << ")";
#endif
return jpeg_length;
std::vector<uint8_t>& StreamedImage::getCompressedImage() {
return compressed_image;
}
bool StreamedImage::decompress(std::vector<uint8_t> &cbuffer)
{
#ifdef TIME_JPEG
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
#endif
/* Use TurboJPEG to decompress the buffer */
int subsampling = 0;
tjhandle jpeg_decompressor = tjInitDecompress();
if (jpeg_decompressor == nullptr) {
LOG(ERROR) << "Cannot initialize JPEG decompressor";
return false;
}
int jpeg_error = tjDecompressHeader2(jpeg_decompressor, cbuffer.data(),
cbuffer.size(), &w, &h, &subsampling);
if (jpeg_error < 0) {
LOG(ERROR) << "Cannot decode JPEG header from StreamedImage";
tjDestroy(jpeg_decompressor);
return false;
}
std::vector<unsigned char> dst_buffer(w * h * 3);
//void *dst_buffer = byte_buffer.data();
jpeg_error = tjDecompress2(jpeg_decompressor,
cbuffer.data(),
cbuffer.size(),
(unsigned char*) dst_buffer.data(),
w,
0,
h,
TJPF_RGB,
TJFLAG_ACCURATEDCT);
tjDestroy(jpeg_decompressor);
if (jpeg_error < 0) {
const char *jpeg_error_str = tjGetErrorStr();
LOG(ERROR) << "JPEG decompression error" << jpeg_error_str;
return false;
}
putImage(w, h, dst_buffer.data());
#ifdef TIME_JPEG
clock_gettime(CLOCK_MONOTONIC, &end_time);
// ms time
double elapsed_time = (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
LOG(INFO) << "TIMING: JPEG decompression: " << elapsed_time << "ms"
<< ", resolution " << w << "x" << h
<< ", sizes " << cbuffer.size() << " (" << dst_buffer.size() << ")";
#endif
return true;
std::vector<cgr_libcluster::uchar3>& StreamedImage::getByteBuffer() {
return byte_buffer;
}
bool StreamedImage::saveImage(const std::string &output_folder_path, const std::string &file_name_prefix, int frame_id) {
@ -183,4 +93,12 @@ bool StreamedImage::saveImage(const std::string &output_folder_path, const std::
return ImageIOUtil::saveFrame(file_path, TypeDesc::UCHAR, image_output.get(),getByteBuffer().data(), w, h);
}
std::vector<cgr_libcluster::uchar3>& StreamedImage::getRgbImageBuffer() {
return byte_buffer;
}
NetCamera& StreamedImage::getNetCamera() {
return camera;
}
} // cgr_libcluster

View File

@ -10,29 +10,26 @@ namespace cgr_libcluster {
class PathTraceDisplay;
class RPCSend;
class RPCReceive;
class TurbojpegCompressor;
class StreamedImage {
public:
StreamedImage() : allocated(false), w(0), h(0) {}
~StreamedImage() {}
void putImage(int width, int height, const void *image)
{
if (w < width || h < height) {
w = width;
h = height;
byte_buffer.resize(w*h);
}
w = width;
h = height;
memcpy(byte_buffer.data(), image, sizeof(cgr_libcluster::uchar3) * width * height);
}
StreamedImage();
~StreamedImage();
StreamedImage(const StreamedImage& );
StreamedImage(StreamedImage&& );
StreamedImage& operator=(const StreamedImage& );
StreamedImage& operator=(StreamedImage&& );
public:
const void* getImage(int &width, int &height) const
{
width = w;
height = h;
return byte_buffer.data();
}
void initImage(NetCamera &cam) {
camera = cam;
if (allocated) {
@ -48,26 +45,28 @@ class StreamedImage {
size_t getBufferSize() const { return w * h * sizeof(cgr_libcluster::uchar3); }
size_t read(RPCReceive &rcv);
size_t write(RPCSend &snd);
#ifdef WITH_WEBRTC
size_t write(cgr_streaming::WebrtcPeer *webrtcPeer);
#endif
int getFrame() const { return camera.frame; }
// Compress the StreamedImage into a jpeg stream stored in cbuffer
size_t compress(unsigned char*&) ;
// Decompress a jpeg stream cbuffer into the StreamedImage
bool decompress(std::vector<uint8_t> &cbuffer);
bool saveImage(const std::string &output_folder_path, const std::string &file_name_prefix, int frame_id);
std::vector<cgr_libcluster::uchar3>& getByteBuffer() {return byte_buffer; }
void copyInCompressedImage(const uint8_t * compressed_image_ptr, const size_t size_in_bytes);
std::vector<cgr_libcluster::uchar3>& getByteBuffer();
std::vector<uint8_t>& getCompressedImage();
std::vector<cgr_libcluster::uchar3>& getRgbImageBuffer();
NetCamera& getNetCamera();
private:
bool allocated;
int w;
int h;
std::vector<cgr_libcluster::uchar3> byte_buffer;
std::vector<cgr_libcluster::uchar3> byte_buffer; // raw image as rgb
std::vector<uint8_t> compressed_image;
NetCamera camera;
// Have TurbojpegCompressor as a pointer here so we can use a forward declaration,
// avoid including the TurboJpeg header in this header, and minimize changes in the Blender code
// which includes this header but is not aware of TurboJpeg.
std::unique_ptr<TurbojpegCompressor> jpeg_compressor_uptr;
};
} // cgr_libcluster

View File

@ -34,12 +34,14 @@ set(LIBRARIES
add_definitions(-DWITH_CYCLES_LOGGING)
if(WITH_CYCLES_DEVICE_CUDA)
if(WITH_CUDA_DYNLOAD)
list(APPEND LIBRARIES extern_cuew)
add_definitions(-DWITH_CUDA_DYNLOAD)
else()
list(APPEND LIBRARIES ${CUDA_CUDA_LIBRARY})
endif()
endif()
if(WITH_CYCLES_LOGGING)
list(APPEND LIBRARIES

View File

@ -1,7 +1,9 @@
#include "testing/testing.h"
#include "gmock/gmock.h"
#ifdef WITH_OPTIX
#include <optix_stubs.h>
#include <optix_function_table_definition.h>
#endif
#include "mocks.h"
#include "denoising/denoising_context.h"

View File

@ -0,0 +1,46 @@
#ifndef __CUDA_UTILS_H__
#define __CUDA_UTILS_H__
namespace cgr_libcluster {
#define THROW_IF_ERROR true
#define DO_NOT_THROW false
#define CUDA_API_CALL(cuda_api, throw_if_error) \
do { \
CUresult cu_result = cuda_api; \
if (cu_result != CUDA_SUCCESS) { \
const char *error_name = NULL; \
cuGetErrorName(cu_result, &error_name); \
std::string message = std::string("ERROR. ") + #cuda_api + " failed with error: " + std::string(error_name); \
LOG(ERROR) << message ; \
if (throw_if_error) { \
throw std::runtime_error(message); \
} \
} \
} while (0)
#define OPTIX_API_CALL(optix_api, throw_if_error) \
do { \
OptixResult optix_result = optix_api; \
if(optix_result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { \
std::string message = std::string("ERROR. ") + #optix_api + " failed due to installed driver " \
"does not support ABI version: " + std::to_string(OPTIX_ABI_VERSION); \
LOG(ERROR) << message ; \
if(throw_if_error) { \
throw std::runtime_error(message); \
} \
} else if(optix_result != OPTIX_SUCCESS) { \
std::string message = std::string("ERROR. ") + #optix_api + " failed with error: " + \
std::to_string(optix_result); \
LOG(ERROR) << message ; \
if(throw_if_error) { \
throw std::runtime_error(message); \
} \
} \
} while (0)
} // end of namespace cgr_libcluster
#endif // __CUDA_UTILS_H__
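
For reference, a minimal usage sketch of these macros (the driver-API calls and variable names are illustrative, not part of this header):

CUdevice device;
CUcontext context;
CUDA_API_CALL(cuInit(0), THROW_IF_ERROR);                      // throws on failure
CUDA_API_CALL(cuDeviceGet(&device, 0), THROW_IF_ERROR);
CUDA_API_CALL(cuCtxCreate(&context, 0, device), DO_NOT_THROW); // logs the error, never throws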

View File

@ -0,0 +1,41 @@
#include <vector>
#include <algorithm> // for std::max / std::min used in clip()
#include <cstdint>   // for uint8_t
#include "vector_types.h" // for uchar3
#include "image.h"
namespace cgr_libcluster {
#define PIXEL_THREAD_COUNT 60
int clip(int n, int lower, int upper) {
return std::max(lower, std::min(n, upper));
}
void yuv2Rgb(uint8_t *yuv_image, int width, int height,
std::vector<cgr_libcluster::uchar3> * rgb_image) {
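// NV12 layout: a width*height Y plane followed by an interleaved UV plane at
// half horizontal/vertical resolution, so each 2x2 block of pixels shares one (U, V) pair.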
const int num_pixels = width * height;
for(int i = 0; i < num_pixels; ++i) {
const int pixel_row = i/width;
const int pixel_column = i - (width * pixel_row);
const int uv_row = pixel_row / 2;
const int uv_column = pixel_column / 2;
const int u_i = num_pixels + uv_row * width + uv_column*2;
const int v_i = u_i + 1;
const int y = yuv_image[i];
const int u = yuv_image[u_i];
const int v = yuv_image[v_i];
const int c = y - 16;
const int d = u - 128;
const int e = v - 128;
const uint8_t r = clip((298*c + 409*e + 128) >> 8, 0, 255);
const uint8_t g = clip((298*c - 100*d - 208*e + 128) >> 8, 0, 255);
const uint8_t b = clip((298*c + 516*d + 128) >> 8, 0, 255);
uchar3 & rgb_pixel = (*rgb_image)[i];
rgb_pixel.x = r;
rgb_pixel.y = g;
rgb_pixel.z = b;
}
}
} // end of namespace cgr_libcluster
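
For reference, a minimal caller sketch (buffer names are illustrative; per image.h, the caller must pre-size the RGB output):

std::vector<uint8_t> yuv(width * height * 3 / 2); // NV12 is 12 bits per pixel
std::vector<cgr_libcluster::uchar3> rgb(width * height);
cgr_libcluster::yuv2Rgb(yuv.data(), width, height, &rgb);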

View File

@ -0,0 +1,12 @@
#ifndef __IMAGE_H__
#define __IMAGE_H__
#include <cstdint> // for uint8_t
#include <vector>  // for std::vector
#include "vector_types.h" // for uchar3
namespace cgr_libcluster {
// This function expects that rgb_image is not null and has been properly sized by the caller to accommodate all pixels
void yuv2Rgb(uint8_t *yuv_image, int width, int height,
std::vector<cgr_libcluster::uchar3> * rgb_image);
}
#endif // __IMAGE_H__

View File

@ -0,0 +1,8 @@
file(GLOB SRC *.cpp *.cu)
if(NOT WITH_CYCLES_CUDA_BINARIES)
find_package(CUDA)
endif()
cuda_add_library(cycles_libcluster_cuda_kernels "${LIB}" ${SRC} ${SRC_HEADERS})

View File

@ -0,0 +1,91 @@
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include "gpu_image_utils.h"
constexpr int NUM_COLOR_CHANNELS = 3;
constexpr int NUM_THREADS = 10240;
constexpr int NUM_THREADS_PER_BLOCK = 512;
template<typename T>
struct PixelRgb {
T r;
T g;
T b;
};
__device__
float color_linear_to_srgb(float c) {
if (c < 0.0031308f)
return (c < 0.0f) ? 0.0f : c * 12.92f;
else
return 1.055f * powf(c, 1.0f / 2.4f) - 0.055f;
}
// Simple RGB to YUV NV12 (4:2:0, 12 bpp) conversion based on the description in this doc:
// https://learn.microsoft.com/en-us/windows/win32/medfound/recommended-8-bit-yuv-formats-for-video-rendering
// To keep it simple, it only supports resolutions with even dimensions. Supporting odd
// dimensions adds edge-case complexity that is unnecessary for our use cases, since we only
// need to support standard resolutions like 720p, 1080p, 1024x1024, 2048x2048, etc.,
// which all have even dimensions.
// RGB to YUV conversion also appears to be available in the NVIDIA Performance Primitives (NPP)
// https://developer.nvidia.com/npp
// but we currently do not use NPP, so we will not introduce that dependency for a single method.
// If a standard or better GPU implementation of RGB-to-YUV conversion becomes available, we can switch to it.
__global__
void gpuRawRgbToGammaCorrectedYuvNv12Impl(CUdeviceptr cu_image_in_ptr, int width, int height,
bool useSrgbColorSpace, CUdeviceptr cu_image_yuv_nv12_out_ptr) {
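// Static work partitioning: each thread converts a contiguous run of pixels;
// end_i clamps the final run so out-of-range threads do no work.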
const int num_pixels = width * height;
const int num_threads_per_block = blockDim.x;
const int num_blocks_in_grid = gridDim.x;
const int total_num_of_threads = num_threads_per_block * num_blocks_in_grid;
const int num_pixels_to_process_in_thread = (num_pixels + total_num_of_threads - 1) / total_num_of_threads; // ceiling division
const int start_i = blockIdx.x * num_threads_per_block * num_pixels_to_process_in_thread +
threadIdx.x * num_pixels_to_process_in_thread;
const int end_i = min(start_i + num_pixels_to_process_in_thread, num_pixels);
const PixelRgb<float> * cu_image_in_rgb_float_ptr = (PixelRgb<float>*)cu_image_in_ptr;
uint8_t* cu_image_yuv_nv12_out_uint_ptr = (uint8_t*)cu_image_yuv_nv12_out_ptr;
float r_f, g_f, b_f;
uint8_t r, g, b;
for(int i = start_i; i < end_i; ++i) {
// Gamma correction
const PixelRgb<float> & raw_pixel = cu_image_in_rgb_float_ptr[i];
if(useSrgbColorSpace) {
r_f = color_linear_to_srgb(raw_pixel.r);
g_f = color_linear_to_srgb(raw_pixel.g);
b_f = color_linear_to_srgb(raw_pixel.b);
} else {
r_f = raw_pixel.r;
g_f = raw_pixel.g;
b_f = raw_pixel.b;
}
r = (uint8_t)(__saturatef(r_f) * 255.0f + 0.5f);
g = (uint8_t)(__saturatef(g_f) * 255.0f + 0.5f);
b = (uint8_t)(__saturatef(b_f) * 255.0f + 0.5f);
// Convert sRGB to YUV
const int pixel_row = i / width;
const int num_pixels_above_the_current_row = pixel_row * width;
const int pixel_column = i - num_pixels_above_the_current_row;
const uint8_t y = (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
cu_image_yuv_nv12_out_uint_ptr[i] = y;
if(pixel_column % 2 == 0 && pixel_row % 2 == 0) {
const uint8_t u = ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
const uint8_t v = ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
const int u_i = num_pixels + i - (pixel_row/2 * width);
cu_image_yuv_nv12_out_uint_ptr[u_i] = u;
cu_image_yuv_nv12_out_uint_ptr[u_i + 1] = v;
}
}
}
void gpuRawRgbToGammaCorrectedYuvNv12(CUdeviceptr cu_image_in_ptr, int width, int height,
bool useSrgbColorSpace, CUstream cuda_stream, CUdeviceptr cu_image_yuv_nv12_out_ptr) {
const int num_blocks = NUM_THREADS / NUM_THREADS_PER_BLOCK;
const size_t size_of_dynamically_allocated_shared_memory = 0;
gpuRawRgbToGammaCorrectedYuvNv12Impl<<<num_blocks, NUM_THREADS_PER_BLOCK,
size_of_dynamically_allocated_shared_memory, cuda_stream>>>(
cu_image_in_ptr, width, height, useSrgbColorSpace, cu_image_yuv_nv12_out_ptr);
}
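
For reference, a minimal host-side sketch of driving this conversion (a sketch under assumptions: driver-API buffers, the default stream, and the CUDA_API_CALL macro from cuda_utils.h; variable names are illustrative):

CUdeviceptr d_rgb, d_nv12;
// Input: width*height float RGB pixels; output: NV12 needs width*height*3/2 bytes.
CUDA_API_CALL(cuMemAlloc(&d_rgb, (size_t)width * height * 3 * sizeof(float)), THROW_IF_ERROR);
CUDA_API_CALL(cuMemAlloc(&d_nv12, (size_t)width * height * 3 / 2), THROW_IF_ERROR);
// ... upload the rendered RGB image into d_rgb, then convert on the default stream:
gpuRawRgbToGammaCorrectedYuvNv12(d_rgb, width, height, /*useSrgbColorSpace=*/true,
/*cuda_stream=*/0, d_nv12);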

View File

@ -0,0 +1,7 @@
#ifndef __IMAGE_UTILS_H__
#define __IMAGE_UTILS_H__
void gpuRawRgbToGammaCorrectedYuvNv12(CUdeviceptr cu_image_in_ptr, int width, int height,
bool useSrgbColorSpace, CUstream cuda_stream, CUdeviceptr cu_image_yuv_nv12_out_ptr);
#endif

View File

@ -979,8 +979,8 @@ def get_resolution_stats(config):
print("Can not get resolution from the log line, it could be corrupted. Log line:\n" + log_line, file = sys.stderr)
log_line = log_file.readline()
continue
width = log_items[-3]
height = log_items[-2]
width = log_items[-2]
height = log_items[-1]
resolution = width + "x" + height
if current_resolution_stats is None or current_resolution_stats.name != resolution:
current_resolution_stats = MetricStats(resolution)

View File

@ -16,7 +16,7 @@ from pprint import pprint
DEFAULT_NUM_GPU = 10
SUPPORTED_DEVICE_TYPES = ["OPTIX", "CUDA"]
SUPPORTED_DEVICE_TYPES = ["OPTIX", "CUDA", "METAL"]
class ProcUnitType(enum.Enum):
CPU = "CPU"
GPU = "GPU"

View File

@ -103,12 +103,16 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
}
/* Create context. */
result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
// FRL_CGR BEGIN
// result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
if (result != CUDA_SUCCESS) {
set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
return;
}
// FRL_CGR END
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@ -116,14 +120,20 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
cuDevArchitecture = major * 100 + minor * 10;
/* Pop context set by cuCtxCreate. */
cuCtxPopCurrent(NULL);
// FRL_CGR BEGIN
// with cuDevicePrimaryCtxRetain there is no need to pop the current context
// cuCtxPopCurrent(NULL);
// FRL_CGR END
}
CUDADevice::~CUDADevice()
{
texture_info.free();
cuda_assert(cuCtxDestroy(cuContext));
// FRL_CGR BEGIN
// do not destroy primary context
//cuda_assert(cuCtxDestroy(cuContext));
// FRL_CGR END
}
bool CUDADevice::support_device(const uint /*kernel_features*/)
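
For reference, a minimal sketch of the primary-context pattern used above (an illustrative sketch, not the Cycles code): cuDevicePrimaryCtxRetain is normally paired with cuDevicePrimaryCtxRelease rather than cuCtxDestroy.

CUcontext ctx;
cuDevicePrimaryCtxRetain(&ctx, cuDevice); // share the device's primary context
// ... make it current with cuCtxPushCurrent(ctx) / cuCtxPopCurrent(NULL) as needed ...
cuDevicePrimaryCtxRelease(cuDevice);      // drop the refcount instead of destroying the context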

View File

@ -963,7 +963,7 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
// static thread_mutex mutex;
// thread_scoped_lock lock(mutex);
//const CUDAContextScope scope(this);
const CUDAContextScope scope(this);
//const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
// FRL_CGR

View File

@ -6,6 +6,7 @@
CCL_NAMESPACE_BEGIN
using namespace std::chrono;
using cgr_libcluster::ClusterSessionParams;
ccl::float3 &toCyclesFloat3(cgr_libcluster::float3 &value);
cgr_libcluster::float3 &fromCyclesFloat3(ccl::float3 &value);
@ -82,6 +83,7 @@ void Session::logRenderSessionPanelSettings() {
" num servers: " << params.cluster_session_params.num_servers << std::endl <<
" compression quality: " << params.cluster_session_params.master_compression_quality << std::endl <<
" master_image_color_format: " << params.cluster_session_params.master_image_color_format << std::endl <<
" master_image_compressor: " << params.cluster_session_params.master_image_compressor << std::endl <<
" master denoiser: " << params.cluster_session_params.master_denoiser;
}
@ -305,7 +307,8 @@ bool Session::server_wait_for_camera()
//-------Methods used by client and standalone session to create NetCamera objects, serialize and transport (client) them
//-------------------------------------------------------------------------------------------------------------------------
//used by client session to initialize a NetCamera for transport
static void initializeNetCamera(cgr_libcluster::NetCamera& netCamera, Camera &cam, int s, int f, int sceneFrame, int iseed, cgr_libcluster::ClusterSessionParams& csp)
static void initializeNetCamera(cgr_libcluster::NetCamera& netCamera, Camera &cam, int s, int f, int sceneFrame, int iseed,
cgr_libcluster::ClusterSessionParams& csp)
{
netCamera.cam_matrix = fromCyclesTransform(cam.get_matrix());
netCamera.cam_type = fromCyclesCameraType(cam.get_camera_type());
@ -329,11 +332,29 @@ static void initializeNetCamera(cgr_libcluster::NetCamera& netCamera, Camera &ca
netCamera.master_denoiser = csp.master_denoiser;
netCamera.master_image_color_format = csp.master_image_color_format;
netCamera.master_image_compressor = csp.master_image_compressor;
VLOG(3) << "Constructed net camera: " << netCamera.cam_width << " " << netCamera.cam_height;
}
void Session::client_send_camera(int samples)
{
#ifdef WITH_CUDA
if (theClient &&
params.cluster_session_params.master_image_compressor == ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER &&
(scene->camera->get_full_width() % 2 || scene->camera->get_full_height() % 2)) {
std::string message = "NVENCODER compressor is requested for images with odd dimension: " +
std::to_string(scene->camera->get_full_width()) + "x" + std::to_string(scene->camera->get_full_height()) +
" Current implementation of NVENCODER only supports images with even dimensions.\
To use NVENCODER compressor please resize image so it has even dimension like 1280x720";
throw std::runtime_error(message);
}
#else
if(params.cluster_session_params.master_image_compressor == ClusterSessionParams::MASTER_IMAGE_COMPRESSOR_NVENCODER) {
throw std::runtime_error("ERROR. NVENCODER compressor is requested. Client is compiled without CUDA support\
so has no nvencoder and will not be able to decode received images which are nvencoded.\
Recompile with CUDA or use JPEG compressor instead. Terminating.");
}
#endif
//TODO Modify object msg need to be handled properly
if (theClient) {
cgr_libcluster::ModifyObjectParams & modify_object_params = params.cluster_session_params.modify_object_params;
@ -349,7 +370,6 @@ void Session::client_send_camera(int samples)
int scene_frame = scene->getCurrentFrame();
initializeNetCamera(netCamera, *(scene->camera), samples, frame_count, scene_frame, iseed, params.cluster_session_params);
if (theClient) {
//client send NetCamera object to master
theClient->send_camera(netCamera);

View File

@ -5,4 +5,16 @@ if(WITH_FREESTYLE)
add_definitions(-DWITH_FREESTYLE)
endif()
#FRL_CLR_BEGIN
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
add_subdirectory(intern)

View File

@ -146,6 +146,7 @@ enum {
CAM_SHOWSENSOR = (1 << 8),
CAM_SHOW_SAFE_CENTER = (1 << 9),
CAM_SHOW_BG_IMAGE = (1 << 10),
CAM_SHOW_KINECT_AZURE = (1 << 11), // Custom FB
};
/* Sensor fit */

View File

@ -1657,6 +1657,7 @@ typedef struct NodeShaderMix {
#define SHD_GLOSSY_GGX 2
#define SHD_GLOSSY_ASHIKHMIN_SHIRLEY 3
#define SHD_GLOSSY_MULTI_GGX 4
#define SHD_GLOSSY_GGX_FRESNEL_REFRACTION 5
/* vector transform */
#define SHD_VECT_TRANSFORM_TYPE_VECTOR 0

View File

@ -850,7 +850,8 @@ typedef struct RenderData {
int schedule_modify_object_message;
int device_scale_factor;
int master_image_color_format;
char _pad_frl_fields[4]; // add padding if needed to satisfy the alignment check (by 8 bytes)
int master_image_compressor;
//char _pad_frl_fields[4]; // add padding if needed to satisfy the alignment check (by 8 bytes)
/* WebRTC related */
int use_webrtc;
@ -898,7 +899,9 @@ typedef enum {
typedef enum {
MASTER_DENOISER_NONE = 0,
MASTER_DENOISER_OIDN = 1,
#ifdef WITH_OPTIX
MASTER_DENOISER_OPTIX = 2,
#endif //WITH_OPTIX
MASTER_DENOISER_BARCELONA = 3,
} eMasterDenoiser;
@ -908,6 +911,14 @@ typedef enum {
MASTER_IMAGE_COLOR_FORMAT_SRGB = 2,
} eMasterImageColorFormat;
/* RenderData.master_image_compressor */
typedef enum {
MASTER_IMAGE_COMPRESSOR_JPEG = 1,
#ifdef WITH_CUDA
MASTER_IMAGE_COMPRESSOR_NVENCODER = 2,
#endif //WITH_CUDA
} eMasterImageCompressor;
/* RenderData.peer_connection_protocol */
typedef enum {
PEER_CONNECTION_PROTOCOL_ANY = 0,

View File

@ -20,6 +20,17 @@ set(LIB
)
add_definitions(-DWITH_DNA_GHASH)
#FRL_CLR_BEGIN
#if(WITH_CYCLES_DEVICE_CUDA)
# add_definitions(-DWITH_CUDA)
#endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
# Needed for `mallocn.c`.
if(HAVE_MALLOC_STATS_H)

View File

@ -241,6 +241,18 @@ if(WITH_PYTHON)
)
endif()
#FRL_CLR_BEGIN
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
if(WITH_IMAGE_OPENEXR)
add_definitions(-DWITH_OPENEXR)
endif()

View File

@ -6283,11 +6283,13 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
0,
"OpenImageDenoise",
"Master denoises images with Intel Open Image Denoise"},
#ifdef WITH_OPTIX
{MASTER_DENOISER_OPTIX,
"OPTIX",
0,
"OptiX",
"Master denoises images with NVIDIA OptiX AI-Accelerated denoiser"},
#endif
{MASTER_DENOISER_BARCELONA,
"BARCELONA",
0,
@ -6310,6 +6312,22 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
{0, NULL, 0, NULL, NULL},
};
static const EnumPropertyItem master_image_compressor_items[] = {
{MASTER_IMAGE_COMPRESSOR_JPEG,
"JPEG",
0,
"JPEG",
"Master compresses images with JPEG"},
#ifdef WITH_CUDA
{MASTER_IMAGE_COMPRESSOR_NVENCODER,
"NVENCODER",
0,
"NVENCODER",
"Master compresses images with NVENCODER."},
#endif //WITH_CUDA
{0, NULL, 0, NULL, NULL},
};
static const EnumPropertyItem render_session_mode_items[] = {
{RENDER_SESSION_MODE_STANDALONE,
"STANDALONE",
@ -6455,6 +6473,12 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
RNA_def_property_ui_text(prop, "Color format", "");
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
prop = RNA_def_property(srna, "master_image_compressor", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_items(prop, master_image_compressor_items);
RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
RNA_def_property_ui_text(prop, "Image compressor", "");
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
// Modify object name section
prop = RNA_def_property(srna, "modify_object_name", PROP_STRING, PROP_NONE);
RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);

View File

@ -111,6 +111,18 @@ if(WITH_OPENCOLORIO)
add_definitions(-DWITH_OCIO)
endif()
#FRL_CLR_BEGIN
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
#FRL_CLR_END
# Setup the EXE sources and `buildinfo`.
set(SRC
creator.c

View File

@ -2020,9 +2020,13 @@ static int arg_handle_master_denoiser(int argc, const char **argv, void *data)
const char *rmtype = argv[1];
if (BLI_strcasecmp(rmtype, "OIDN") == 0) {
scene->r.master_denoiser = MASTER_DENOISER_OIDN;
} else if (BLI_strcasecmp(rmtype, "OPTIX") == 0) {
}
#ifdef WITH_OPTIX
else if (BLI_strcasecmp(rmtype, "OPTIX") == 0) {
scene->r.master_denoiser = MASTER_DENOISER_OPTIX;
} else if (BLI_strcasecmp(rmtype, "BARCELONA") == 0) {
}
#endif
else if (BLI_strcasecmp(rmtype, "BARCELONA") == 0) {
scene->r.master_denoiser = MASTER_DENOISER_BARCELONA;
} else {
printf("\nError: Unknown master denoiser %s (--master-denoiser <OIDN> or <OPTIX> or <BARCELONA>).\n", rmtype);
@ -2064,11 +2068,48 @@ static int arg_handle_master_image_color_format(int argc, const char **argv, voi
else {
printf(
"\nError: no blend loaded. "
"order the arguments so '--master_image_color_format' is after the blend is loaded.\n");
"order the arguments so '--master-image-color-format' is after the blend is loaded.\n");
return 0;
}
}
static const char arg_handle_master_image_compressor_doc[] =
"<master_image_compressor>\n"
"\tCompressor that the master uses to compress images before sending them to a client.\n"
"\tSupported values: JPEG, NVENCODER\n";
static int arg_handle_master_image_compressor(int argc, const char **argv, void *data)
{
bContext *C = data;
Scene *scene = CTX_data_scene(C);
if (scene) {
scene->r.master_image_compressor = MASTER_IMAGE_COMPRESSOR_JPEG; // default to JPEG
if (argc > 1) {
const char *rmtype = argv[1];
if (BLI_strcasecmp(rmtype, "JPEG") == 0) {
scene->r.master_image_compressor = MASTER_IMAGE_COMPRESSOR_JPEG;
}
#ifdef WITH_CUDA
else if (BLI_strcasecmp(rmtype, "NVENCODER") == 0) {
scene->r.master_image_compressor = MASTER_IMAGE_COMPRESSOR_NVENCODER;
}
#endif //WITH_CUDA
else {
printf("\nError: Unknown compressor %s (--master-image-compressor <JPEG> or <NVENCODER>).\n", rmtype);
}
}
return 1;
}
else {
printf(
"\nError: no blend loaded. "
"order the arguments so '--master-image-compressor' is after the blend is loaded.\n");
return 0;
}
}
#ifdef WITH_WEBRTC
static int arg_handle_webrtc_int_param(int argc, const char **argv, const char *arg_id, int *param, int default_param_value)
@ -3049,6 +3090,7 @@ void main_args_setup(bContext *C, bArgs *ba)
BLI_args_add(ba, NULL, "--save_denoise_io", CB(arg_handle_save_denoise_io), C);
BLI_args_add(ba, NULL, "--save_cameras", CB(arg_handle_save_cameras), C);
BLI_args_add(ba, NULL, "--master-image-color-format", CB(arg_handle_master_image_color_format), C);
BLI_args_add(ba, NULL, "--master-image-compressor", CB(arg_handle_master_image_compressor), C);
#ifdef WITH_WEBRTC
BLI_args_add(ba, NULL, "--use-webrtc", CB(arg_handle_use_webrtc), C);
BLI_args_add(ba, NULL, "--signaling-server-address", CB(arg_handle_signaling_server_address), C);