diff --git a/intern/gawain/CMakeLists.txt b/intern/gawain/CMakeLists.txt
index 9924daa8cd1..424b364ae8e 100644
--- a/intern/gawain/CMakeLists.txt
+++ b/intern/gawain/CMakeLists.txt
@@ -16,6 +16,7 @@ set(SRC
 	src/gwn_imm_util.c
 	src/gwn_primitive.c
 	src/gwn_shader_interface.c
+	src/gwn_vertex_array_id.cpp
 	src/gwn_vertex_buffer.c
 	src/gwn_vertex_format.c
@@ -30,6 +31,7 @@ set(SRC
 	gawain/gwn_primitive.h
 	gawain/gwn_primitive_private.h
 	gawain/gwn_shader_interface.h
+	gawain/gwn_vertex_array_id.h
 	gawain/gwn_vertex_buffer.h
 	gawain/gwn_vertex_format.h
 	gawain/gwn_vertex_format_private.h
diff --git a/intern/gawain/gawain/gwn_batch.h b/intern/gawain/gawain/gwn_batch.h
index 94cd893f09e..c676cfef119 100644
--- a/intern/gawain/gawain/gwn_batch.h
+++ b/intern/gawain/gawain/gwn_batch.h
@@ -23,34 +23,61 @@ typedef enum {
 } Gwn_BatchPhase;
 
 #define GWN_BATCH_VBO_MAX_LEN 3
+#define GWN_BATCH_VAO_STATIC_LEN 3
+#define GWN_BATCH_VAO_DYN_ALLOC_COUNT 16
 
 typedef struct Gwn_Batch {
 	// geometry
 	Gwn_VertBuf* verts[GWN_BATCH_VBO_MAX_LEN]; // verts[0] is required, others can be NULL
+	Gwn_VertBuf* inst; // instance attribs
 	Gwn_IndexBuf* elem; // NULL if element list not needed
-	Gwn_PrimType prim_type;
 	GLenum gl_prim_type;
 
-	// book-keeping
-	GLuint vao_id; // remembers all geometry state (vertex attrib bindings & element buffer)
-	Gwn_BatchPhase phase;
-	bool program_dirty;
-	bool program_in_use;
-	unsigned owns_flag;
-
-	// state
+	// cached values (avoid dereferencing later)
+	GLuint vao_id;
 	GLuint program;
-	const Gwn_ShaderInterface* interface;
+	const struct Gwn_ShaderInterface* interface;
+
+	// book-keeping
+	unsigned owns_flag;
+	struct Gwn_Context *context; // used to free all vaos. this implies all vaos were created under the same context.
+	Gwn_BatchPhase phase;
+	bool program_in_use;
+
+	// VAO management: remembers all geometry state (vertex attrib bindings & element buffer)
+	// for each shader interface. Start with a static number of VAOs and fall back to a dynamic
+	// count if necessary. Once a batch goes dynamic it does not go back.
+	bool is_dynamic_vao_count;
+	union {
+		// Static handle count
+		struct {
+			const struct Gwn_ShaderInterface* interfaces[GWN_BATCH_VAO_STATIC_LEN];
+			GLuint vao_ids[GWN_BATCH_VAO_STATIC_LEN];
+		} static_vaos;
+		// Dynamic handle count
+		struct {
+			unsigned count;
+			const struct Gwn_ShaderInterface** interfaces;
+			GLuint* vao_ids;
+		} dynamic_vaos;
+	};
+
+	// XXX This is the only solution if we want to have some data structure using
+	// batches as keys to identify nodes. We must destroy these nodes with this callback.
+	void (*free_callback)(struct Gwn_Batch*, void*);
+	void* callback_data;
 } Gwn_Batch;
 
 enum {
 	GWN_BATCH_OWNS_VBO = (1 << 0), /* each vbo index gets bit-shifted */
+	GWN_BATCH_OWNS_INSTANCES = (1 << 30),
 	GWN_BATCH_OWNS_INDEX = (1 << 31),
 };
 
 Gwn_Batch* GWN_batch_create_ex(Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
 void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
+Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src);
 
 #define GWN_batch_create(prim, verts, elem) \
 	GWN_batch_create_ex(prim, verts, elem, 0)
@@ -59,11 +86,18 @@ void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, un
 
 void GWN_batch_discard(Gwn_Batch*); // verts & elem are not discarded
 
+void GWN_batch_callback_free_set(Gwn_Batch*, void (*callback)(Gwn_Batch*, void*), void*);
+
+void GWN_batch_instbuf_set(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo); // Instancing
+
 int GWN_batch_vertbuf_add_ex(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo);
 
 #define GWN_batch_vertbuf_add(batch, verts) \
 	GWN_batch_vertbuf_add_ex(batch, verts, false)
 
+// This is a private function
+void GWN_batch_remove_interface_ref(Gwn_Batch*, const Gwn_ShaderInterface*);
+
 void GWN_batch_program_set(Gwn_Batch*, GLuint program, const Gwn_ShaderInterface*);
 void GWN_batch_program_unset(Gwn_Batch*);
 // Entire batch draws with one shader program, but can be redrawn later with another program.
@@ -84,11 +118,14 @@ void GWN_batch_uniform_4fv(Gwn_Batch*, const char* name, const float data[4]);
 
 void GWN_batch_draw(Gwn_Batch*);
 
+// This does not bind/unbind shader and does not call gpuBindMatrices()
+void GWN_batch_draw_range_ex(Gwn_Batch*, int v_first, int v_count, bool force_instance);
 
-void GWN_batch_draw_stupid(Gwn_Batch*, int v_first, int v_count);
-void GWN_batch_draw_stupid_instanced(Gwn_Batch*, Gwn_Batch*, int instance_first, int instance_count);
-void GWN_batch_draw_procedural(Gwn_Batch*, Gwn_PrimType, int v_count);
+#define GWN_batch_draw_range(batch, first, count) \
+	GWN_batch_draw_range_ex(batch, first, count, false)
 
+// Does not even need batch
+void GWN_draw_primitive(Gwn_PrimType, int v_count);
 
 #if 0 // future plans
diff --git a/intern/gawain/gawain/gwn_buffer_id.h b/intern/gawain/gawain/gwn_buffer_id.h
index db5df99f526..6f51ca6905d 100644
--- a/intern/gawain/gawain/gwn_buffer_id.h
+++ b/intern/gawain/gawain/gwn_buffer_id.h
@@ -25,10 +25,6 @@ extern "C" {
 GLuint GWN_buf_id_alloc(void);
 void GWN_buf_id_free(GLuint buffer_id);
 
-GLuint GWN_vao_alloc(void);
-void GWN_vao_free(GLuint vao_id);
-
-
 #ifdef __cplusplus
 }
 #endif
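
Usage sketch for the free-callback hook declared above (not part of the patch; the cache type and helper are hypothetical, only the GWN_* calls are real). It mirrors how the draw manager uses it further down (instance_batch_free in draw_instance_data.c):

    // A system that keys data on batches evicts its entry when the batch
    // is discarded by its owner.
    static void cache_entry_evict(Gwn_Batch* batch, void* user_data)
        {
        struct HypotheticalCache* cache = (struct HypotheticalCache*)user_data;
        hypothetical_cache_remove(cache, batch); // assumed helper
        }

    // Register once; GWN_batch_discard() invokes the callback before freeing.
    GWN_batch_callback_free_set(batch, cache_entry_evict, cache);
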
diff --git a/intern/gawain/gawain/gwn_shader_interface.h b/intern/gawain/gawain/gwn_shader_interface.h
index 345ad8d389b..3bca541d6e8 100644
--- a/intern/gawain/gawain/gwn_shader_interface.h
+++ b/intern/gawain/gawain/gwn_shader_interface.h
@@ -54,6 +54,7 @@ typedef struct Gwn_ShaderInput {
 } Gwn_ShaderInput;
 
 #define GWN_NUM_SHADERINTERFACE_BUCKETS 257
+#define GWN_SHADERINTERFACE_REF_ALLOC_COUNT 16
 
 typedef struct Gwn_ShaderInterface {
 	GLint program;
@@ -63,6 +64,8 @@ typedef struct Gwn_ShaderInterface {
 	Gwn_ShaderInput* ubo_buckets[GWN_NUM_SHADERINTERFACE_BUCKETS];
 	Gwn_ShaderInput* builtin_uniforms[GWN_NUM_UNIFORMS];
 	char* name_buffer;
+	struct Gwn_Batch** batches; // references to batches using this interface
+	unsigned batches_ct;
 } Gwn_ShaderInterface;
 
 Gwn_ShaderInterface* GWN_shaderinterface_create(GLint program_id);
@@ -72,3 +75,7 @@ const Gwn_ShaderInput* GWN_shaderinterface_uniform(const Gwn_ShaderInterface*, c
 const Gwn_ShaderInput* GWN_shaderinterface_uniform_builtin(const Gwn_ShaderInterface*, Gwn_UniformBuiltin);
 const Gwn_ShaderInput* GWN_shaderinterface_ubo(const Gwn_ShaderInterface*, const char* name);
 const Gwn_ShaderInput* GWN_shaderinterface_attr(const Gwn_ShaderInterface*, const char* name);
+
+// keep track of batches using this interface
+void GWN_shaderinterface_add_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);
+void GWN_shaderinterface_remove_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);
diff --git a/intern/gawain/gawain/gwn_vertex_array_id.h b/intern/gawain/gawain/gwn_vertex_array_id.h
index 6d2a059b9bd..1c093d428ce 100644
--- a/intern/gawain/gawain/gwn_vertex_array_id.h
+++ b/intern/gawain/gawain/gwn_vertex_array_id.h
@@ -26,8 +26,8 @@ extern "C" {
 #include "gwn_context.h"
 
 GLuint GWN_vao_default(void);
-GLuint GWN_vao_alloc_new(void);
-void GWN_vao_free_new(GLuint vao_id, Gwn_Context*);
+GLuint GWN_vao_alloc(void);
+void GWN_vao_free(GLuint vao_id, Gwn_Context*);
 
 #ifdef __cplusplus
 }
diff --git a/intern/gawain/src/gwn_batch.c b/intern/gawain/src/gwn_batch.c
index ec3f98e348c..098c547c662 100644
--- a/intern/gawain/src/gwn_batch.c
+++ b/intern/gawain/src/gwn_batch.c
@@ -11,12 +11,48 @@
 
 #include "gwn_batch.h"
 #include "gwn_buffer_id.h"
+#include "gwn_vertex_array_id.h"
 #include "gwn_primitive_private.h"
 #include <stdlib.h>
+#include <string.h>
 
 // necessary functions from matrix API
 extern void gpuBindMatrices(const Gwn_ShaderInterface* shaderface);
-extern bool gpuMatricesDirty(void); // how best to use this here?
+
+static void batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first);
+
+static void Batch_vao_cache_clear(Gwn_Batch* batch)
+    {
+    if (batch->is_dynamic_vao_count)
+        {
+        for (int i = 0; i < batch->dynamic_vaos.count; ++i)
+            {
+            if (batch->dynamic_vaos.vao_ids[i])
+                GWN_vao_free(batch->dynamic_vaos.vao_ids[i], batch->context);
+            if (batch->dynamic_vaos.interfaces[i])
+                GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->dynamic_vaos.interfaces[i], batch);
+            }
+        free(batch->dynamic_vaos.interfaces);
+        free(batch->dynamic_vaos.vao_ids);
+        }
+    else
+        {
+        for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+            {
+            if (batch->static_vaos.vao_ids[i])
+                GWN_vao_free(batch->static_vaos.vao_ids[i], batch->context);
+            if (batch->static_vaos.interfaces[i])
+                GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->static_vaos.interfaces[i], batch);
+            }
+        }
+
+    batch->is_dynamic_vao_count = false;
+    for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+        {
+        batch->static_vaos.vao_ids[i] = 0;
+        batch->static_vaos.interfaces[i] = NULL;
+        }
+    }
 
 Gwn_Batch* GWN_batch_create_ex(
         Gwn_PrimType prim_type, Gwn_VertBuf* verts, Gwn_IndexBuf* elem,
@@ -40,11 +76,25 @@ void GWN_batch_init_ex(
 	batch->verts[0] = verts;
 	for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
 		batch->verts[v] = NULL;
+	batch->inst = NULL;
 	batch->elem = elem;
-	batch->prim_type = prim_type;
 	batch->gl_prim_type = convert_prim_type_to_gl(prim_type);
 	batch->phase = GWN_BATCH_READY_TO_DRAW;
+	batch->is_dynamic_vao_count = false;
 	batch->owns_flag = owns_flag;
+	batch->free_callback = NULL;
 	}
+
+// This will share the VBOs with the new batch
+Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src)
+    {
+    Gwn_Batch* batch = GWN_batch_create_ex(GWN_PRIM_POINTS, batch_src->verts[0], batch_src->elem, 0);
+
+    batch->gl_prim_type = batch_src->gl_prim_type;
+    for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
+        batch->verts[v] = batch_src->verts[v];
+
+    return batch;
+    }
@@ -52,6 +102,9 @@ void GWN_batch_discard(Gwn_Batch* batch)
 	if (batch->owns_flag & GWN_BATCH_OWNS_INDEX)
 		GWN_indexbuf_discard(batch->elem);
 
+	if (batch->owns_flag & GWN_BATCH_OWNS_INSTANCES)
+		GWN_vertbuf_discard(batch->inst);
+
 	if ((batch->owns_flag & ~GWN_BATCH_OWNS_INDEX) != 0)
 		{
 		for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
@@ -63,12 +116,39 @@ void GWN_batch_discard(Gwn_Batch* batch)
 			}
 		}
 
-	if (batch->vao_id)
-		GWN_vao_free(batch->vao_id);
+	Batch_vao_cache_clear(batch);
+
+	if (batch->free_callback)
+		batch->free_callback(batch, batch->callback_data);
 
 	free(batch);
 	}
 
+void GWN_batch_callback_free_set(Gwn_Batch* batch, void (*callback)(Gwn_Batch*, void*), void* user_data)
+    {
+    batch->free_callback = callback;
+    batch->callback_data = user_data;
+    }
+
+void GWN_batch_instbuf_set(Gwn_Batch* batch, Gwn_VertBuf* inst, bool own_vbo)
+    {
+#if TRUST_NO_ONE
+    assert(inst != NULL);
+#endif
+    // redo the bindings
+    Batch_vao_cache_clear(batch);
+
+    if (batch->inst != NULL && (batch->owns_flag & GWN_BATCH_OWNS_INSTANCES))
+        GWN_vertbuf_discard(batch->inst);
+
+    batch->inst = inst;
+
+    if (own_vbo)
+        batch->owns_flag |= GWN_BATCH_OWNS_INSTANCES;
+    else
+        batch->owns_flag &= ~GWN_BATCH_OWNS_INSTANCES;
+    }
+
 int GWN_batch_vertbuf_add_ex(
         Gwn_Batch* batch, Gwn_VertBuf* verts, bool own_vbo)
@@ -100,12 +180,96 @@ int GWN_batch_vertbuf_add_ex(
 
 void GWN_batch_program_set(Gwn_Batch* batch, GLuint program, const Gwn_ShaderInterface* shaderface)
 	{
 #if TRUST_NO_ONE
-	assert(glIsProgram(program));
+	assert(glIsProgram(shaderface->program));
+	assert(batch->program_in_use == 0);
 #endif
+	batch->vao_id = 0;
 	batch->program = program;
 	batch->interface = shaderface;
-	batch->program_dirty = true;
+
+	// Search through cache
+	if (batch->is_dynamic_vao_count)
+		{
+		for (int i = 0; i < batch->dynamic_vaos.count && batch->vao_id == 0; ++i)
+			if (batch->dynamic_vaos.interfaces[i] == shaderface)
+				batch->vao_id = batch->dynamic_vaos.vao_ids[i];
+		}
+	else
+		{
+		for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN && batch->vao_id == 0; ++i)
+			if (batch->static_vaos.interfaces[i] == shaderface)
+				batch->vao_id = batch->static_vaos.vao_ids[i];
+		}
+
+	if (batch->vao_id == 0)
+		{
+		if (batch->context == NULL)
+			batch->context = GWN_context_active_get();
+#if TRUST_NO_ONE && 0 // disabled until we use a separate single context for UI.
+		else // Make sure you are not trying to draw this batch in another context.
+			assert(batch->context == GWN_context_active_get());
+#endif
+		// Cache miss, time to add a new entry!
+		if (!batch->is_dynamic_vao_count)
+			{
+			int i; // find first unused slot
+			for (i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+				if (batch->static_vaos.vao_ids[i] == 0)
+					break;
+
+			if (i < GWN_BATCH_VAO_STATIC_LEN)
+				{
+				batch->static_vaos.interfaces[i] = shaderface;
+				batch->static_vaos.vao_ids[i] = batch->vao_id = GWN_vao_alloc();
+				}
+			else
+				{
+				// Not enough room, switch to dynamic.
+				batch->is_dynamic_vao_count = true;
+				// Erase previous entries, they will be added back if drawn again.
+				for (int j = 0; j < GWN_BATCH_VAO_STATIC_LEN; ++j)
+					{
+					GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface*)batch->static_vaos.interfaces[j], batch);
+					GWN_vao_free(batch->static_vaos.vao_ids[j], batch->context);
+					}
+				// Init dynamic arrays and let the branch below set the values.
+				batch->dynamic_vaos.count = GWN_BATCH_VAO_DYN_ALLOC_COUNT;
+				batch->dynamic_vaos.interfaces = calloc(batch->dynamic_vaos.count, sizeof(Gwn_ShaderInterface*));
+				batch->dynamic_vaos.vao_ids = calloc(batch->dynamic_vaos.count, sizeof(GLuint));
+				}
+			}
+
+		if (batch->is_dynamic_vao_count)
+			{
+			int i; // find first unused slot
+			for (i = 0; i < batch->dynamic_vaos.count; ++i)
+				if (batch->dynamic_vaos.vao_ids[i] == 0)
+					break;
+
+			if (i == batch->dynamic_vaos.count)
+				{
+				// Not enough room, realloc the arrays.
+				i = batch->dynamic_vaos.count;
+				batch->dynamic_vaos.count += GWN_BATCH_VAO_DYN_ALLOC_COUNT;
+				batch->dynamic_vaos.interfaces = realloc(batch->dynamic_vaos.interfaces, sizeof(Gwn_ShaderInterface*) * batch->dynamic_vaos.count);
+				batch->dynamic_vaos.vao_ids = realloc(batch->dynamic_vaos.vao_ids, sizeof(GLuint) * batch->dynamic_vaos.count);
+				memset(batch->dynamic_vaos.interfaces + i, 0, sizeof(Gwn_ShaderInterface*) * GWN_BATCH_VAO_DYN_ALLOC_COUNT);
+				memset(batch->dynamic_vaos.vao_ids + i, 0, sizeof(GLuint) * GWN_BATCH_VAO_DYN_ALLOC_COUNT);
+				}
+
+			batch->dynamic_vaos.interfaces[i] = shaderface;
+			batch->dynamic_vaos.vao_ids[i] = batch->vao_id = GWN_vao_alloc();
+			}
+
+		GWN_shaderinterface_add_batch_ref((Gwn_ShaderInterface*)shaderface, batch);
+
+		// We just got a fresh VAO; we need to initialize it.
+		glBindVertexArray(batch->vao_id);
+		batch_update_program_bindings(batch, 0);
+		glBindVertexArray(0);
+		}
 
 	GWN_batch_program_use_begin(batch); // hack! to make Batch_Uniform* simpler
 	}
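
From the caller's side, the cache above makes switching a batch between programs cheap after the first draw. A minimal sketch, assuming two valid program/interface pairs (the names are hypothetical):

    GWN_batch_program_set(batch, program_a, interface_a); // cache miss: VAO created & initialized once
    GWN_batch_draw(batch);
    GWN_batch_program_set(batch, program_b, interface_b); // second cache entry
    GWN_batch_draw(batch);
    GWN_batch_program_set(batch, program_a, interface_a); // cache hit: vao_id reused, no re-binding
    GWN_batch_draw(batch);
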
@@ -118,94 +282,104 @@ void GWN_batch_program_unset(Gwn_Batch* batch)
 	batch->program_in_use = false;
 	}
 
-static void create_bindings(Gwn_Batch* batch, const Gwn_ShaderInterface* interface, unsigned int v_first, const bool use_instancing)
+void GWN_batch_remove_interface_ref(Gwn_Batch* batch, const Gwn_ShaderInterface* interface)
 	{
-	for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
+	if (batch->is_dynamic_vao_count)
 		{
-		Gwn_VertBuf* verts = batch->verts[v];
-		if (verts == NULL)
-			break;
-
-		const Gwn_VertFormat* format = &verts->format;
-
-		const unsigned attrib_ct = format->attrib_ct;
-		const unsigned stride = format->stride;
-
-		GWN_vertbuf_use(verts);
-
-		for (unsigned a_idx = 0; a_idx < attrib_ct; ++a_idx)
+		for (int i = 0; i < batch->dynamic_vaos.count; ++i)
 			{
-			const Gwn_VertAttr* a = format->attribs + a_idx;
-
-			const GLvoid* pointer = (const GLubyte*)0 + a->offset + v_first * stride;
-
-			for (unsigned n_idx = 0; n_idx < a->name_ct; ++n_idx)
+			if (batch->dynamic_vaos.interfaces[i] == interface)
 				{
-				const Gwn_ShaderInput* input = GWN_shaderinterface_attr(interface, a->name[n_idx]);
+				GWN_vao_free(batch->dynamic_vaos.vao_ids[i], batch->context);
+				batch->dynamic_vaos.vao_ids[i] = 0;
+				batch->dynamic_vaos.interfaces[i] = NULL;
+				break; // cannot have duplicates
+				}
+			}
+		}
+	else
+		{
+		int i;
+		for (i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+			{
+			if (batch->static_vaos.interfaces[i] == interface)
+				{
+				GWN_vao_free(batch->static_vaos.vao_ids[i], batch->context);
+				batch->static_vaos.vao_ids[i] = 0;
+				batch->static_vaos.interfaces[i] = NULL;
+				break; // cannot have duplicates
+				}
+			}
+		}
+	}
 
-				if (input == NULL) continue;
+static void create_bindings(Gwn_VertBuf* verts, const Gwn_ShaderInterface* interface, unsigned int v_first, const bool use_instancing)
+	{
+	const Gwn_VertFormat* format = &verts->format;
 
-				if (a->comp_ct == 16 || a->comp_ct == 12 || a->comp_ct == 8)
-					{
+	const unsigned attrib_ct = format->attrib_ct;
+	const unsigned stride = format->stride;
+
+	GWN_vertbuf_use(verts);
+
+	for (unsigned a_idx = 0; a_idx < attrib_ct; ++a_idx)
+		{
+		const Gwn_VertAttr* a = format->attribs + a_idx;
+
+		const GLvoid* pointer = (const GLubyte*)0 + a->offset + v_first * stride;
+
+		for (unsigned n_idx = 0; n_idx < a->name_ct; ++n_idx)
+			{
+			const Gwn_ShaderInput* input = GWN_shaderinterface_attr(interface, a->name[n_idx]);
+
+			if (input == NULL) continue;
+
+			if (a->comp_ct == 16 || a->comp_ct == 12 || a->comp_ct == 8)
+				{
 #if TRUST_NO_ONE
-					assert(a->fetch_mode == GWN_FETCH_FLOAT);
-					assert(a->gl_comp_type == GL_FLOAT);
+				assert(a->fetch_mode == GWN_FETCH_FLOAT);
+				assert(a->gl_comp_type == GL_FLOAT);
 #endif
-					for (int i = 0; i < a->comp_ct / 4; ++i)
-						{
-						glEnableVertexAttribArray(input->location + i);
-						glVertexAttribDivisor(input->location + i, (use_instancing) ? 1 : 0);
-						glVertexAttribPointer(input->location + i, 4, a->gl_comp_type, GL_FALSE, stride,
-						                      (const GLubyte*)pointer + i * 16);
-						}
-					}
-				else
+				for (int i = 0; i < a->comp_ct / 4; ++i)
 					{
-					glEnableVertexAttribArray(input->location);
-					glVertexAttribDivisor(input->location, (use_instancing) ? 1 : 0);
+					glEnableVertexAttribArray(input->location + i);
+					glVertexAttribDivisor(input->location + i, (use_instancing) ? 1 : 0);
+					glVertexAttribPointer(input->location + i, 4, a->gl_comp_type, GL_FALSE, stride,
+					                      (const GLubyte*)pointer + i * 16);
+					}
+				}
+			else
+				{
+				glEnableVertexAttribArray(input->location);
+				glVertexAttribDivisor(input->location, (use_instancing) ? 1 : 0);
 
-					switch (a->fetch_mode)
-						{
-						case GWN_FETCH_FLOAT:
-						case GWN_FETCH_INT_TO_FLOAT:
-							glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_FALSE, stride, pointer);
-							break;
-						case GWN_FETCH_INT_TO_FLOAT_UNIT:
-							glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_TRUE, stride, pointer);
-							break;
-						case GWN_FETCH_INT:
-							glVertexAttribIPointer(input->location, a->comp_ct, a->gl_comp_type, stride, pointer);
-						}
+				switch (a->fetch_mode)
+					{
+					case GWN_FETCH_FLOAT:
+					case GWN_FETCH_INT_TO_FLOAT:
+						glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_FALSE, stride, pointer);
+						break;
+					case GWN_FETCH_INT_TO_FLOAT_UNIT:
+						glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_TRUE, stride, pointer);
+						break;
+					case GWN_FETCH_INT:
+						glVertexAttribIPointer(input->location, a->comp_ct, a->gl_comp_type, stride, pointer);
					}
 				}
 			}
 		}
 	}
 
-static void Batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first)
+static void batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first)
 	{
-	// disable all as a precaution
-	// why are we not using prev_attrib_enabled_bits?? see immediate.c
-	for (unsigned a_idx = 0; a_idx < GWN_VERT_ATTR_MAX_LEN; ++a_idx)
-		glDisableVertexAttribArray(a_idx);
+	for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN && batch->verts[v] != NULL; ++v)
+		create_bindings(batch->verts[v], batch->interface, (batch->inst) ? 0 : v_first, false);
 
-	create_bindings(batch, batch->interface, v_first, false);
+	if (batch->inst)
+		create_bindings(batch->inst, batch->interface, v_first, true);
 
-	batch->program_dirty = false;
-	}
-
-static void Batch_update_program_bindings_instancing(Gwn_Batch* batch, Gwn_Batch* batch_instancing, unsigned int instance_first)
-	{
-	// disable all as a precaution
-	// why are we not using prev_attrib_enabled_bits?? see immediate.c
-	for (unsigned a_idx = 0; a_idx < GWN_VERT_ATTR_MAX_LEN; ++a_idx)
-		glDisableVertexAttribArray(a_idx);
-
-	create_bindings(batch, batch->interface, 0, false);
-	if (batch_instancing)
-		create_bindings(batch_instancing, batch->interface, instance_first, true);
-
-	batch->program_dirty = false;
+	if (batch->elem)
+		GWN_indexbuf_use(batch->elem);
 	}
 
 void GWN_batch_program_use_begin(Gwn_Batch* batch)
@@ -290,142 +464,86 @@ void GWN_batch_uniform_4fv(Gwn_Batch* batch, const char* name, const float data[
 	glUniform4fv(uniform->location, 1, data);
 	}
 
-static void Batch_prime(Gwn_Batch* batch)
-	{
-	batch->vao_id = GWN_vao_alloc();
-	glBindVertexArray(batch->vao_id);
-
-	for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
-		{
-		if (batch->verts[v] == NULL)
-			break;
-		GWN_vertbuf_use(batch->verts[v]);
-		}
-
-	if (batch->elem)
-		GWN_indexbuf_use(batch->elem);
-
-	// vertex attribs and element list remain bound to this VAO
-	}
-
 void GWN_batch_draw(Gwn_Batch* batch)
 	{
 #if TRUST_NO_ONE
 	assert(batch->phase == GWN_BATCH_READY_TO_DRAW);
-	assert(glIsProgram(batch->program));
+	assert(batch->verts[0]->vbo_id != 0);
 #endif
-
-	if (batch->vao_id)
-		glBindVertexArray(batch->vao_id);
-	else
-		Batch_prime(batch);
-
-	if (batch->program_dirty)
-		Batch_update_program_bindings(batch, 0);
-
 	GWN_batch_program_use_begin(batch);
+	gpuBindMatrices(batch->interface); // external call.
 
-	gpuBindMatrices(batch->interface);
-
-	if (batch->elem)
-		{
-		const Gwn_IndexBuf* el = batch->elem;
-
-#if GWN_TRACK_INDEX_RANGE
-		if (el->base_index)
-			glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, el->index_ct, el->gl_index_type, 0, el->base_index);
-		else
-			glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, el->index_ct, el->gl_index_type, 0);
-#else
-		glDrawElements(batch->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0);
-#endif
-		}
-	else
-		glDrawArrays(batch->gl_prim_type, 0, batch->verts[0]->vertex_ct);
+	GWN_batch_draw_range_ex(batch, 0, 0, false);
 
 	GWN_batch_program_use_end(batch);
-	glBindVertexArray(0);
 	}
 
-void GWN_batch_draw_stupid(Gwn_Batch* batch, int v_first, int v_count)
-	{
-	if (batch->vao_id)
-		glBindVertexArray(batch->vao_id);
-	else
-		Batch_prime(batch);
-
-	if (batch->program_dirty)
-		Batch_update_program_bindings(batch, v_first);
-
-	// GWN_batch_program_use_begin(batch);
-
-	//gpuBindMatrices(batch->program);
-
-	// Infer lenght if vertex count is not given
-	if (v_count == 0)
-		v_count = (batch->elem) ? batch->elem->index_ct : batch->verts[0]->vertex_ct;
-
-	if (batch->elem)
-		{
-		const Gwn_IndexBuf* el = batch->elem;
-
-#if GWN_TRACK_INDEX_RANGE
-		if (el->base_index)
-			glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0, el->base_index);
-		else
-			glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0);
-#else
-		glDrawElements(batch->gl_prim_type, v_count, GL_UNSIGNED_INT, 0);
-#endif
-		}
-	else
-		glDrawArrays(batch->gl_prim_type, 0, v_count);
-
-	// GWN_batch_program_use_end(batch);
-	glBindVertexArray(0);
-	}
-
-void GWN_batch_draw_stupid_instanced(Gwn_Batch* batch_instanced, Gwn_Batch* batch_instancing, int instance_first, int instance_count)
+void GWN_batch_draw_range_ex(Gwn_Batch* batch, int v_first, int v_count, bool force_instance)
 	{
 #if TRUST_NO_ONE
-	// batch_instancing can be null if the number of instances is specified.
-	assert(batch_instancing != NULL || instance_count != 0);
+	assert(!(force_instance && (batch->inst == NULL)) || v_count > 0); // we cannot infer length if force_instance
 #endif
-	if (batch_instanced->vao_id)
-		glBindVertexArray(batch_instanced->vao_id);
-	else
-		Batch_prime(batch_instanced);
 
-	if (batch_instanced->program_dirty)
-		Batch_update_program_bindings_instancing(batch_instanced, batch_instancing, instance_first);
-
-	if (instance_count == 0)
-		instance_count = batch_instancing->verts[0]->vertex_ct;
-
-	if (batch_instanced->elem)
+	// If using offset drawing, use the default VAO and redo bindings.
+	if (v_first != 0)
 		{
-		const Gwn_IndexBuf* el = batch_instanced->elem;
-
-#if GWN_TRACK_INDEX_RANGE
-		glDrawElementsInstancedBaseVertex(batch_instanced->gl_prim_type, el->index_ct, el->gl_index_type, 0, instance_count, el->base_index);
-#else
-		glDrawElementsInstanced(batch_instanced->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0, instance_count);
-#endif
+		glBindVertexArray(GWN_vao_default());
+		batch_update_program_bindings(batch, v_first);
 		}
 	else
-		glDrawArraysInstanced(batch_instanced->gl_prim_type, 0, batch_instanced->verts[0]->vertex_ct, instance_count);
+		glBindVertexArray(batch->vao_id);
+
+	if (force_instance || batch->inst)
+		{
+		// Infer length if instance count is not given
+		if (v_count == 0)
+			v_count = batch->inst->vertex_ct;
+
+		if (batch->elem)
+			{
+			const Gwn_IndexBuf* el = batch->elem;
+
+#if GWN_TRACK_INDEX_RANGE
+			glDrawElementsInstancedBaseVertex(batch->gl_prim_type, el->index_ct, el->gl_index_type, 0, v_count, el->base_index);
+#else
+			glDrawElementsInstanced(batch->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0, v_count);
+#endif
+			}
+		else
+			glDrawArraysInstanced(batch->gl_prim_type, 0, batch->verts[0]->vertex_ct, v_count);
+		}
+	else
+		{
+		// Infer length if vertex count is not given
+		if (v_count == 0)
+			v_count = (batch->elem) ? batch->elem->index_ct : batch->verts[0]->vertex_ct;
+
+		if (batch->elem)
+			{
+			const Gwn_IndexBuf* el = batch->elem;
+
+#if GWN_TRACK_INDEX_RANGE
+			if (el->base_index)
+				glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0, el->base_index);
+			else
+				glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0);
+#else
+			glDrawElements(batch->gl_prim_type, v_count, GL_UNSIGNED_INT, 0);
+#endif
+			}
+		else
+			glDrawArrays(batch->gl_prim_type, 0, v_count);
+		}
+
 	glBindVertexArray(0);
 	}
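
The single entry point above replaces the old draw_stupid/draw_stupid_instanced pair. A usage sketch (counts hypothetical; the program must already be set via GWN_batch_program_set() and matrices bound by the caller, since this function does neither):

    GWN_batch_draw_range_ex(batch, 0, 0, false); // full range; v_count == 0 infers the length
    GWN_batch_draw_range(batch, 100, 50);        // sub-range; v_first != 0 rebinds on the default VAO (slower path)
    GWN_batch_draw_range_ex(batch, 0, 0, true);  // instanced: with batch->inst set, v_count is the instance count
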
 // just draw some vertices and let shader place them where we want.
-void GWN_batch_draw_procedural(Gwn_Batch* batch, Gwn_PrimType prim_type, int v_count)
+void GWN_draw_primitive(Gwn_PrimType prim_type, int v_count)
 	{
 	// we cannot draw without vao ... annoying ...
-	if (batch->vao_id)
-		glBindVertexArray(batch->vao_id);
-	else
-		Batch_prime(batch);
+	glBindVertexArray(GWN_vao_default());
 
 	GLenum type = convert_prim_type_to_gl(prim_type);
 	glDrawArrays(type, 0, v_count);
diff --git a/intern/gawain/src/gwn_buffer_id.cpp b/intern/gawain/src/gwn_buffer_id.cpp
index a93c3950d29..64bad855ca7 100644
--- a/intern/gawain/src/gwn_buffer_id.cpp
+++ b/intern/gawain/src/gwn_buffer_id.cpp
@@ -20,7 +20,6 @@
 #endif
 
 static std::vector<GLuint> orphaned_buffer_ids;
-static std::vector<GLuint> orphaned_vao_ids;
 
 static std::mutex orphan_mutex;
 
@@ -36,10 +35,6 @@ static bool thread_is_main()
 
 GLuint GWN_buf_id_alloc()
 	{
-#if TRUST_NO_ONE
-	assert(thread_is_main());
-#endif
-
 	// delete orphaned IDs
 	orphan_mutex.lock();
 	if (!orphaned_buffer_ids.empty())
@@ -73,43 +68,3 @@ void GWN_buf_id_free(GLuint buffer_id)
 		orphan_mutex.unlock();
 		}
 	}
-
-GLuint GWN_vao_alloc()
-	{
-#if TRUST_NO_ONE
-	assert(thread_is_main());
-#endif
-
-	// delete orphaned IDs
-	orphan_mutex.lock();
-	if (!orphaned_vao_ids.empty())
-		{
-		const auto orphaned_vao_ct = (unsigned)orphaned_vao_ids.size();
-#if ORPHAN_DEBUG
-		printf("deleting %u orphaned VAO%s\n", orphaned_vao_ct, orphaned_vao_ct == 1 ? "" : "s");
-#endif
-		glDeleteVertexArrays(orphaned_vao_ct, orphaned_vao_ids.data());
-		orphaned_vao_ids.clear();
-		}
-	orphan_mutex.unlock();
-
-	GLuint new_vao_id = 0;
-	glGenVertexArrays(1, &new_vao_id);
-	return new_vao_id;
-	}
-
-void GWN_vao_free(GLuint vao_id)
-	{
-	if (thread_is_main())
-		glDeleteVertexArrays(1, &vao_id);
-	else
-		{
-		// add this ID to the orphaned list
-		orphan_mutex.lock();
-#if ORPHAN_DEBUG
-		printf("orphaning VAO %u\n", vao_id);
-#endif
-		orphaned_vao_ids.emplace_back(vao_id);
-		orphan_mutex.unlock();
-		}
-	}
diff --git a/intern/gawain/src/gwn_immediate.c b/intern/gawain/src/gwn_immediate.c
index 1c0776d1bbf..f063665b423 100644
--- a/intern/gawain/src/gwn_immediate.c
+++ b/intern/gawain/src/gwn_immediate.c
@@ -14,6 +14,7 @@
 #include "gwn_attr_binding.h"
 #include "gwn_attr_binding_private.h"
 #include "gwn_vertex_format_private.h"
+#include "gwn_vertex_array_id.h"
 #include "gwn_primitive_private.h"
 #include <string.h>
 
@@ -27,6 +28,7 @@ typedef struct {
 #if IMM_BATCH_COMBO
 	Gwn_Batch* batch;
 #endif
+	Gwn_Context* context;
 
 	// current draw call
 	GLubyte* buffer_data;
@@ -86,8 +88,8 @@ void immActivate(void)
 	assert(imm.prim_type == GWN_PRIM_NONE); // make sure we're not between a Begin/End pair
 	assert(imm.vao_id == 0);
 #endif
-
 	imm.vao_id = GWN_vao_alloc();
+	imm.context = GWN_context_active_get();
 	}
 
 void immDeactivate(void)
@@ -97,8 +99,7 @@ void immDeactivate(void)
 	assert(imm.prim_type == GWN_PRIM_NONE); // make sure we're not between a Begin/End pair
 	assert(imm.vao_id != 0);
 #endif
-
-	GWN_vao_free(imm.vao_id);
+	GWN_vao_free(imm.vao_id, imm.context);
 	imm.vao_id = 0;
 	imm.prev_enabled_attrib_bits = 0;
 	}
diff --git a/intern/gawain/src/gwn_shader_interface.c b/intern/gawain/src/gwn_shader_interface.c
index 33821ae36e2..ef3e8f0f3fa 100644
--- a/intern/gawain/src/gwn_shader_interface.c
+++ b/intern/gawain/src/gwn_shader_interface.c
@@ -10,6 +10,7 @@
 // the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 #include "gwn_shader_interface.h"
+#include "gwn_vertex_array_id.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -263,6 +264,10 @@ Gwn_ShaderInterface* GWN_shaderinterface_create(GLint program)
 #endif
 		}
 
+	// Batches ref buffer
+	shaderface->batches_ct = GWN_SHADERINTERFACE_REF_ALLOC_COUNT;
+	shaderface->batches = calloc(shaderface->batches_ct, sizeof(Gwn_Batch*));
+
 	return shaderface;
 	}
 
@@ -274,6 +279,12 @@ void GWN_shaderinterface_discard(Gwn_ShaderInterface* shaderface)
 	buckets_free(shaderface->ubo_buckets);
 	// Free memory used by name_buffer.
 	free(shaderface->name_buffer);
+	// Remove this interface from all linked batches' VAO caches.
+	for (int i = 0; i < shaderface->batches_ct; ++i)
+		if (shaderface->batches[i] != NULL)
+			GWN_batch_remove_interface_ref(shaderface->batches[i], shaderface);
+
+	free(shaderface->batches);
 	// Free memory used by shader interface by its self.
 	free(shaderface);
 	}
@@ -316,3 +327,34 @@ const Gwn_ShaderInput* GWN_shaderinterface_attr(const Gwn_ShaderInterface* shade
 	{
 	return buckets_lookup(shaderface->attrib_buckets, shaderface->name_buffer, name);
 	}
+
+void GWN_shaderinterface_add_batch_ref(Gwn_ShaderInterface* shaderface, Gwn_Batch* batch)
+    {
+    int i; // find first unused slot
+    for (i = 0; i < shaderface->batches_ct; ++i)
+        if (shaderface->batches[i] == NULL)
+            break;
+
+    if (i == shaderface->batches_ct)
+        {
+        // Not enough room, realloc the array.
+        i = shaderface->batches_ct;
+        shaderface->batches_ct += GWN_SHADERINTERFACE_REF_ALLOC_COUNT;
+        shaderface->batches = realloc(shaderface->batches, sizeof(Gwn_Batch*) * shaderface->batches_ct);
+        memset(shaderface->batches + i, 0, sizeof(Gwn_Batch*) * GWN_SHADERINTERFACE_REF_ALLOC_COUNT);
+        }
+
+    shaderface->batches[i] = batch;
+    }
+
+void GWN_shaderinterface_remove_batch_ref(Gwn_ShaderInterface* shaderface, Gwn_Batch* batch)
+    {
+    for (int i = 0; i < shaderface->batches_ct; ++i)
+        {
+        if (shaderface->batches[i] == batch)
+            {
+            shaderface->batches[i] = NULL;
+            break; // cannot have duplicates
+            }
+        }
+    }
diff --git a/intern/gawain/src/gwn_vertex_array_id.cpp b/intern/gawain/src/gwn_vertex_array_id.cpp
index 602c1c4919c..27010f03bc0 100644
--- a/intern/gawain/src/gwn_vertex_array_id.cpp
+++ b/intern/gawain/src/gwn_vertex_array_id.cpp
@@ -109,7 +109,7 @@ GLuint GWN_vao_default(void)
 	return active_ctx->default_vao;
 	}
 
-GLuint GWN_vao_alloc_new(void)
+GLuint GWN_vao_alloc(void)
 	{
 #if TRUST_NO_ONE
 	assert(active_ctx); // need at least an active context
@@ -123,7 +123,7 @@ GLuint GWN_vao_alloc_new(void)
 	}
 
 // this can be called from multiple thread
-void GWN_vao_free_new(GLuint vao_id, Gwn_Context* ctx)
+void GWN_vao_free(GLuint vao_id, Gwn_Context* ctx)
 	{
 	if (ctx == active_ctx)
 		glDeleteVertexArrays(1, &vao_id);
 	else
diff --git a/source/blender/draw/intern/DRW_render.h b/source/blender/draw/intern/DRW_render.h
index f62b224b094..82ba2922dd0 100644
--- a/source/blender/draw/intern/DRW_render.h
+++ b/source/blender/draw/intern/DRW_render.h
@@ -341,7 +341,7 @@ typedef void (DRWCallGenerateFn)(
 	void (*draw_fn)(DRWShadingGroup *shgroup, struct Gwn_Batch *geom),
 	void *user_data);
 
-void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances);
+void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch);
 
 void DRW_shgroup_free(struct DRWShadingGroup *shgroup);
 void DRW_shgroup_call_add(DRWShadingGroup *shgroup, struct Gwn_Batch *geom, float (*obmat)[4]);
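
Since VAO names are not shared between OpenGL contexts, the renamed allocator pair above is context-aware. Intended usage, sketched (the context handle is whatever the batch captured at creation; only calls visible in this patch are used):

    Gwn_Context* ctx = GWN_context_active_get(); // context that will own the VAO
    GLuint vao = GWN_vao_alloc();                // allocates in the active context
    /* ... record bindings, draw ... */
    GWN_vao_free(vao, ctx); // deletes immediately if ctx is active, defers otherwise
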
diff --git a/source/blender/draw/intern/draw_instance_data.c b/source/blender/draw/intern/draw_instance_data.c
index c2aae8e33ae..bfff1a2f546 100644
--- a/source/blender/draw/intern/draw_instance_data.c
+++ b/source/blender/draw/intern/draw_instance_data.c
@@ -42,12 +42,29 @@
 #define BUFFER_CHUNK_SIZE 32
 #define BUFFER_VERTS_CHUNK 32
 
-typedef struct DRWInstanceBuffer {
+typedef struct DRWBatchingBuffer {
 	struct DRWShadingGroup *shgroup;  /* Link back to the owning shGroup. Also tells if it's used */
 	Gwn_VertFormat *format;           /* Identifier. */
 	Gwn_VertBuf *vert;                /* Gwn_VertBuf contained in the Gwn_Batch. */
 	Gwn_Batch *batch;                 /* Gwn_Batch containing the Gwn_VertBuf. */
-} DRWInstanceBuffer;
+} DRWBatchingBuffer;
+
+typedef struct DRWInstancingBuffer {
+	struct DRWShadingGroup *shgroup;  /* Link back to the owning shGroup. Also tells if it's used */
+	Gwn_VertFormat *format;           /* Identifier. */
+	Gwn_Batch *instance;              /* Identifier. */
+	Gwn_VertBuf *vert;                /* Gwn_VertBuf contained in the Gwn_Batch. */
+	Gwn_Batch *batch;                 /* Gwn_Batch containing the Gwn_VertBuf. */
+} DRWInstancingBuffer;
+
+typedef struct DRWInstanceChunk {
+	size_t cursor;      /* Offset to the next instance data. */
+	size_t alloc_size;  /* Number of DRWBatchingBuffer/Batches alloc'd in ibufs/btchs. */
+	union {
+		DRWBatchingBuffer *bbufs;
+		DRWInstancingBuffer *ibufs;
+	};
+} DRWInstanceChunk;
 
 struct DRWInstanceData {
 	struct DRWInstanceData *next;
@@ -60,19 +77,19 @@ struct DRWInstanceData {
 };
 
 struct DRWInstanceDataList {
+	struct DRWInstanceDataList *next, *prev;
 	/* Linked lists for all possible data pool size */
 	/* Not entirely sure if we should separate them in the first place.
 	 * This is done to minimize the reattribution misses. */
 	DRWInstanceData *idata_head[MAX_INSTANCE_DATA_SIZE];
 	DRWInstanceData *idata_tail[MAX_INSTANCE_DATA_SIZE];
 
-	struct {
-		size_t cursor;     /* Offset to the next instance data. */
-		size_t alloc_size; /* Number of DRWInstanceBuffer alloc'd in ibufs. */
-		DRWInstanceBuffer *ibufs;
-	} ibuffers;
+	DRWInstanceChunk instancing;
+	DRWInstanceChunk batching;
 };
 
+static ListBase g_idatalists = {NULL, NULL};
+
 /* -------------------------------------------------------------------- */
 
 /** \name Instance Buffer Management
@@ -87,89 +104,174 @@ struct DRWInstanceDataList {
 * that would be too slow]).
 **/
 
-void DRW_instance_buffer_request(
-        DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
-        Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type)
+static void instance_batch_free(Gwn_Batch *batch, void *UNUSED(user_data))
 {
-	BLI_assert(format);
-
-	DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
-	int first_non_alloced = -1;
-
-	/* Search for an unused batch. */
-	for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
-		if (ibuf->shgroup == NULL) {
-			if (ibuf->format == format) {
-				ibuf->shgroup = shgroup;
-				*r_batch = ibuf->batch;
-				*r_vert = ibuf->vert;
-				return;
-			}
-			else if (ibuf->format == NULL && first_non_alloced == -1) {
-				first_non_alloced = i;
+	/* Free all batches that have the same key before they are reused. */
+	/* TODO: Make it thread safe! Batch freeing can happen from another thread. */
+	/* XXX we need to iterate over all idatalists unless we make some smart
+	 * data structure to store the locations to update. */
+	for (DRWInstanceDataList *idatalist = g_idatalists.first; idatalist; idatalist = idatalist->next) {
+		DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+		for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
+			if (ibuf->instance == batch) {
+				BLI_assert(ibuf->shgroup == NULL); /* Make sure it has no other users. */
+				GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
+				GWN_BATCH_DISCARD_SAFE(ibuf->batch);
+				/* Tag as non alloced. */
+				ibuf->format = NULL;
 			}
 		}
 	}
+}
 
-	if (first_non_alloced == -1) {
-		/* There is no batch left. Allocate more. */
-		first_non_alloced = idatalist->ibuffers.alloc_size;
-		idatalist->ibuffers.alloc_size += BUFFER_CHUNK_SIZE;
-		idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
-		                                         idatalist->ibuffers.alloc_size * sizeof(DRWInstanceBuffer));
-		/* Clear new part of the memory. */
-		memset(idatalist->ibuffers.ibufs + first_non_alloced, 0, sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE);
+void DRW_batching_buffer_request(
+        DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
+        Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
+{
+	DRWInstanceChunk *chunk = &idatalist->batching;
+	DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
+	BLI_assert(format);
+	/* Search for an unused batch. */
+	for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
+		if (bbuf->shgroup == NULL) {
+			if (bbuf->format == format) {
+				bbuf->shgroup = shgroup;
+				*r_batch = bbuf->batch;
+				*r_vert = bbuf->vert;
+				return;
+			}
+		}
+	}
+	int new_id = 0; /* Find insertion point. */
+	for (; new_id < chunk->alloc_size; ++new_id) {
+		if (chunk->bbufs[new_id].format == NULL)
+			break;
+	}
+	/* If there is no batch left, allocate more. */
+	if (new_id == chunk->alloc_size) {
+		new_id = chunk->alloc_size;
+		chunk->alloc_size += BUFFER_CHUNK_SIZE;
+		chunk->bbufs = MEM_reallocN(chunk->bbufs, chunk->alloc_size * sizeof(DRWBatchingBuffer));
+		memset(chunk->bbufs + new_id, 0, sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE);
 	}
-	/* Create the batch. */
-	ibuf = idatalist->ibuffers.ibufs + first_non_alloced;
+	bbuf = chunk->bbufs + new_id;
+	bbuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
+	bbuf->batch = *r_batch = GWN_batch_create_ex(type, bbuf->vert, NULL, 0);
+	bbuf->format = format;
+	bbuf->shgroup = shgroup;
+	GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
+}
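
Caller-side sketch of the batching variant (idatalist/shgroup/format stand in for whatever the caller already holds; the real call site is drw_interface_batching_init() in draw_manager.c below):

    Gwn_Batch *batch;
    Gwn_VertBuf *vbo;
    DRW_batching_buffer_request(idatalist, g_pos_format, GWN_PRIM_POINTS, shgroup,
                                &batch, &vbo);
    // Fill vbo during the frame; DRW_instance_buffer_finish() later resizes
    // and uploads it, and reclaims buffers that went unused.
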
+
+void DRW_instancing_buffer_request(
+        DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
+        Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
+{
+	DRWInstanceChunk *chunk = &idatalist->instancing;
+	DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+	BLI_assert(format);
+	/* Search for an unused batch. */
+	for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
+		if (ibuf->shgroup == NULL) {
+			if (ibuf->format == format) {
+				if (ibuf->instance == instance) {
+					ibuf->shgroup = shgroup;
+					*r_batch = ibuf->batch;
+					*r_vert = ibuf->vert;
+					return;
+				}
+			}
+		}
+	}
+	int new_id = 0; /* Find insertion point. */
+	for (; new_id < chunk->alloc_size; ++new_id) {
+		if (chunk->ibufs[new_id].format == NULL)
+			break;
+	}
+	/* If there is no batch left, allocate more. */
+	if (new_id == chunk->alloc_size) {
+		new_id = chunk->alloc_size;
+		chunk->alloc_size += BUFFER_CHUNK_SIZE;
+		chunk->ibufs = MEM_reallocN(chunk->ibufs, chunk->alloc_size * sizeof(DRWInstancingBuffer));
+		memset(chunk->ibufs + new_id, 0, sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE);
+	}
+	/* Create the batch. */
+	ibuf = chunk->ibufs + new_id;
 	ibuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
-	ibuf->batch = *r_batch = GWN_batch_create_ex(type, ibuf->vert, NULL, GWN_BATCH_OWNS_VBO);
+	ibuf->batch = *r_batch = GWN_batch_duplicate(instance);
 	ibuf->format = format;
 	ibuf->shgroup = shgroup;
-
+	ibuf->instance = instance;
 	GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
+	GWN_batch_instbuf_set(ibuf->batch, ibuf->vert, false);
+	/* Make sure to free this ibuf if the instance batch gets freed. */
+	GWN_batch_callback_free_set(instance, &instance_batch_free, NULL);
 }
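
The instancing variant keys entries on (format, instance batch) and ties the new gawain pieces together; a sketch with a hypothetical instance_geom:

    Gwn_Batch *batch;
    Gwn_VertBuf *inst_vbo;
    DRW_instancing_buffer_request(idatalist, format, instance_geom, shgroup,
                                  &batch, &inst_vbo);
    // batch is a GWN_batch_duplicate() of instance_geom with inst_vbo attached
    // through GWN_batch_instbuf_set(); the instance_batch_free() callback
    // registered on instance_geom evicts this entry if the source batch is
    // discarded elsewhere.
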
 
 void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
 {
-	DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
-	size_t minimum_alloc_size = 1; /* Avoid 0 size realloc. */
-
+	size_t realloc_size = 1; /* Avoid 0 size realloc. */
 	/* Resize down buffers in use and send data to GPU & free unused buffers. */
-	for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+	DRWInstanceChunk *batching = &idatalist->batching;
+	DRWBatchingBuffer *bbuf = batching->bbufs;
+	for (int i = 0; i < batching->alloc_size; i++, bbuf++) {
+		if (bbuf->shgroup != NULL) {
+			realloc_size = i + 1;
+			unsigned int vert_ct = DRW_shgroup_get_instance_count(bbuf->shgroup);
+			vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
+			if (vert_ct + BUFFER_VERTS_CHUNK <= bbuf->vert->vertex_ct) {
+				unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
+				size = size - size % BUFFER_VERTS_CHUNK;
+				GWN_vertbuf_data_resize(bbuf->vert, size);
+			}
+			GWN_vertbuf_use(bbuf->vert); /* Send data. */
+			bbuf->shgroup = NULL; /* Set as non used for the next round. */
+		}
+		else {
+			GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
+			GWN_BATCH_DISCARD_SAFE(bbuf->batch);
+			bbuf->format = NULL; /* Tag as non alloced. */
+		}
+	}
+	/* Rounding up to nearest chunk size. */
+	realloc_size += BUFFER_CHUNK_SIZE - 1;
+	realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
+	/* Resize down if necessary. */
+	if (realloc_size < batching->alloc_size) {
+		batching->alloc_size = realloc_size;
+		batching->bbufs = MEM_reallocN(batching->bbufs, realloc_size * sizeof(DRWBatchingBuffer));
+	}
+
+	realloc_size = 1;
+	/* Resize down buffers in use and send data to GPU & free unused buffers. */
+	DRWInstanceChunk *instancing = &idatalist->instancing;
+	DRWInstancingBuffer *ibuf = instancing->ibufs;
+	for (int i = 0; i < instancing->alloc_size; i++, ibuf++) {
 		if (ibuf->shgroup != NULL) {
-			minimum_alloc_size = i + 1;
+			realloc_size = i + 1;
 			unsigned int vert_ct = DRW_shgroup_get_instance_count(ibuf->shgroup);
-			/* Do not realloc to 0 size buffer */
-			vert_ct += (vert_ct == 0) ? 1 : 0;
-			/* Resize buffer to reclame space. */
+			vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
 			if (vert_ct + BUFFER_VERTS_CHUNK <= ibuf->vert->vertex_ct) {
 				unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
 				size = size - size % BUFFER_VERTS_CHUNK;
 				GWN_vertbuf_data_resize(ibuf->vert, size);
 			}
-			/* Send data. */
-			GWN_vertbuf_use(ibuf->vert);
-			/* Set as non used for the next round. */
-			ibuf->shgroup = NULL;
+			GWN_vertbuf_use(ibuf->vert); /* Send data. */
+			ibuf->shgroup = NULL; /* Set as non used for the next round. */
 		}
 		else {
+			GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
 			GWN_BATCH_DISCARD_SAFE(ibuf->batch);
-			/* Tag as non alloced. */
-			ibuf->format = NULL;
+			ibuf->format = NULL; /* Tag as non alloced. */
 		}
 	}
-
-	/* Resize down the handle buffer (ibuffers). */
 	/* Rounding up to nearest chunk size. */
-	minimum_alloc_size += BUFFER_CHUNK_SIZE - 1;
-	minimum_alloc_size -= minimum_alloc_size % BUFFER_CHUNK_SIZE;
+	realloc_size += BUFFER_CHUNK_SIZE - 1;
+	realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
 	/* Resize down if necessary. */
-	if (minimum_alloc_size < idatalist->ibuffers.alloc_size) {
-		idatalist->ibuffers.alloc_size = minimum_alloc_size;
-		idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
-		                                         minimum_alloc_size * sizeof(DRWInstanceBuffer));
+	if (realloc_size < instancing->alloc_size) {
+		instancing->alloc_size = realloc_size;
+		instancing->ibufs = MEM_reallocN(instancing->ibufs, realloc_size * sizeof(DRWInstancingBuffer));
 	}
 }
@@ -183,7 +285,7 @@ void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
 static DRWInstanceData *drw_instance_data_create(
         DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group)
 {
-	DRWInstanceData *idata = MEM_mallocN(sizeof(DRWInstanceData), "DRWInstanceData");
+	DRWInstanceData *idata = MEM_callocN(sizeof(DRWInstanceData), "DRWInstanceData");
 	idata->next = NULL;
 	idata->used = true;
 	idata->data_size = attrib_size;
@@ -263,15 +365,18 @@ DRWInstanceData *DRW_instance_data_request(
 DRWInstanceDataList *DRW_instance_data_list_create(void)
 {
 	DRWInstanceDataList *idatalist = MEM_callocN(sizeof(DRWInstanceDataList), "DRWInstanceDataList");
-	idatalist->ibuffers.ibufs = MEM_callocN(sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE, "DRWInstanceBuffers");
-	idatalist->ibuffers.alloc_size = BUFFER_CHUNK_SIZE;
+	idatalist->batching.bbufs = MEM_callocN(sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE, "DRWBatchingBuffers");
+	idatalist->batching.alloc_size = BUFFER_CHUNK_SIZE;
+	idatalist->instancing.ibufs = MEM_callocN(sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE, "DRWInstancingBuffers");
+	idatalist->instancing.alloc_size = BUFFER_CHUNK_SIZE;
+
+	BLI_addtail(&g_idatalists, idatalist);
 
 	return idatalist;
 }
 
 void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
 {
-	DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
 	DRWInstanceData *idata, *next_idata;
 
 	for (int i = 0; i < MAX_INSTANCE_DATA_SIZE; ++i) {
@@ -284,10 +389,21 @@ void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
 		idatalist->idata_tail[i] = NULL;
 	}
 
-	for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+	DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
+	for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
+		GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
+		GWN_BATCH_DISCARD_SAFE(bbuf->batch);
+	}
+	MEM_freeN(idatalist->batching.bbufs);
+
+	DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+	for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
+		GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
 		GWN_BATCH_DISCARD_SAFE(ibuf->batch);
 	}
-	MEM_freeN(idatalist->ibuffers.ibufs);
+	MEM_freeN(idatalist->instancing.ibufs);
+
+	BLI_remlink(&g_idatalists, idatalist);
 }
 
 void DRW_instance_data_list_reset(DRWInstanceDataList *idatalist)
diff --git a/source/blender/draw/intern/draw_instance_data.h b/source/blender/draw/intern/draw_instance_data.h
index a7a66c9baff..3b0f7839277 100644
--- a/source/blender/draw/intern/draw_instance_data.h
+++ b/source/blender/draw/intern/draw_instance_data.h
@@ -43,9 +43,12 @@ void *DRW_instance_data_get(DRWInstanceData *idata);
 DRWInstanceData *DRW_instance_data_request(
         DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group);
 
-void DRW_instance_buffer_request(
-        DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
-        Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type);
+void DRW_batching_buffer_request(
+        DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
+        Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
+void DRW_instancing_buffer_request(
+        DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
+        Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
 
 /* Upload all instance data to the GPU as soon as possible. */
 void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist);
diff --git a/source/blender/draw/intern/draw_manager.c b/source/blender/draw/intern/draw_manager.c
index a3a59efc799..5299fa04e4e 100644
--- a/source/blender/draw/intern/draw_manager.c
+++ b/source/blender/draw/intern/draw_manager.c
@@ -665,6 +665,24 @@ static void drw_interface_init(DRWInterface *interface, GPUShader *shader)
 }
 
 static void drw_interface_instance_init(
+        DRWShadingGroup *shgroup, GPUShader *shader, Gwn_Batch *batch, Gwn_VertFormat *format)
+{
+	DRWInterface *interface = &shgroup->interface;
+	drw_interface_init(interface, shader);
+
+#ifndef NDEBUG
+	interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
+#endif
+	BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
+	BLI_assert(shgroup->instance_geom != NULL);
+
+	if (format != NULL) {
+		DRW_instancing_buffer_request(DST.idatalist, format, batch, shgroup,
+		                              &shgroup->instancing_geom, &interface->instance_vbo);
+	}
+}
+
+static void drw_interface_batching_init(
         DRWShadingGroup *shgroup, GPUShader *shader, Gwn_VertFormat *format)
 {
 	DRWInterface *interface = &shgroup->interface;
@@ -673,36 +691,19 @@ static void drw_interface_instance_init(
 
 #ifndef NDEBUG
 	interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
 #endif
+	BLI_assert(format != NULL);
 
 	Gwn_PrimType type;
-	Gwn_Batch **r_batch = NULL;
 	switch (shgroup->type) {
-		case DRW_SHG_INSTANCE:
-			r_batch = &shgroup->instancing_geom;
-			type = GWN_PRIM_POINTS;
-			break;
-		case DRW_SHG_POINT_BATCH:
-			r_batch = &shgroup->batch_geom;
-			type = GWN_PRIM_POINTS;
-			break;
-		case DRW_SHG_LINE_BATCH:
-			r_batch = &shgroup->batch_geom;
-			type = GWN_PRIM_LINES;
-			break;
-		case DRW_SHG_TRIANGLE_BATCH:
-			r_batch = &shgroup->batch_geom;
-			type = GWN_PRIM_TRIS;
-			break;
+		case DRW_SHG_POINT_BATCH: type = GWN_PRIM_POINTS; break;
+		case DRW_SHG_LINE_BATCH: type = GWN_PRIM_LINES; break;
+		case DRW_SHG_TRIANGLE_BATCH: type = GWN_PRIM_TRIS; break;
 		default:
 			BLI_assert(0);
 	}
 
-	if (format != NULL) {
-		DRW_instance_buffer_request(DST.idatalist, format, shgroup, r_batch, &interface->instance_vbo, type);
-	}
-	else {
-		*r_batch = NULL;
-	}
+	DRW_batching_buffer_request(DST.idatalist, format, type, shgroup,
+	                            &shgroup->batch_geom, &interface->instance_vbo);
 }
 
 static void drw_interface_uniform(DRWShadingGroup *shgroup, const char *name,
@@ -882,7 +883,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
 		shgroup->type = DRW_SHG_INSTANCE;
 		shgroup->instance_geom = geom;
 		shgroup->instance_data = ob->data;
-		drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), format);
+		drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), geom, format);
 		drw_shgroup_material_inputs(shgroup, material, gpupass);
 	}
 
@@ -890,7 +891,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
 }
 
 DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
-        struct GPUMaterial *material, DRWPass *pass, int size)
+        struct GPUMaterial *material, DRWPass *pass, int tri_count)
 {
 #ifdef USE_GPU_SELECT
 	BLI_assert((G.f & G_PICKSEL) == 0);
@@ -899,10 +900,10 @@ DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
 	DRWShadingGroup *shgroup = drw_shgroup_material_create_ex(gpupass, pass);
 
 	if (shgroup) {
-		shgroup->type = DRW_SHG_TRIANGLE_BATCH;
-		shgroup->interface.instance_count = size * 3;
-		/* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
+		/* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
 		drw_interface_init(&shgroup->interface, GPU_pass_shader(gpupass));
+		shgroup->type = DRW_SHG_TRIANGLE_BATCH;
+		shgroup->interface.instance_count = tri_count * 3;
 		drw_shgroup_material_inputs(shgroup, material, gpupass);
 	}
 
@@ -923,7 +924,7 @@ DRWShadingGroup *DRW_shgroup_instance_create(
 
 	shgroup->type = DRW_SHG_INSTANCE;
 	shgroup->instance_geom = geom;
-	drw_interface_instance_init(shgroup, shader, format);
+	drw_interface_instance_init(shgroup, shader, geom, format);
 
 	return shgroup;
 }
@@ -937,7 +938,7 @@ DRWShadingGroup *DRW_shgroup_point_batch_create(struct GPUShader *shader, DRWPas
 	DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
 
 	shgroup->type = DRW_SHG_POINT_BATCH;
-	drw_interface_instance_init(shgroup, shader, g_pos_format);
+	drw_interface_batching_init(shgroup, shader, g_pos_format);
 
 	return shgroup;
 }
@@ -949,7 +950,7 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
 	DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
 
 	shgroup->type = DRW_SHG_LINE_BATCH;
-	drw_interface_instance_init(shgroup, shader, g_pos_format);
+	drw_interface_batching_init(shgroup, shader, g_pos_format);
 
 	return shgroup;
 }
@@ -957,18 +958,18 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
 
 /* Very special batch. Use this if you position
 * your vertices with the vertex shader
 * and dont need any VBO attrib */
-DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int size)
+DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int tri_count)
 {
 #ifdef USE_GPU_SELECT
 	BLI_assert((G.f & G_PICKSEL) == 0);
 #endif
 	DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
 
-	/* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
+	/* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
 	drw_interface_init(&shgroup->interface, shader);
 
 	shgroup->type = DRW_SHG_TRIANGLE_BATCH;
-	shgroup->interface.instance_count = size * 3;
+	shgroup->interface.instance_count = tri_count * 3;
 
 	return shgroup;
 }
@@ -991,13 +992,19 @@ void DRW_shgroup_free(struct DRWShadingGroup *UNUSED(shgroup))
 } ((void)0)
 
 /* Specify an external batch instead of adding each attrib one by one. */
-void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances)
+void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch)
 {
 	BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
-	BLI_assert(shgroup->instancing_geom == NULL);
+	BLI_assert(shgroup->interface.instance_count == 0);
+	/* You cannot use an external instancing batch without a dummy format. */
+	BLI_assert(shgroup->instancing_geom != NULL);
 
 	shgroup->type = DRW_SHG_INSTANCE_EXTERNAL;
-	shgroup->instancing_geom = instances;
+	/* PERF: This destroys the VAO cache, so better check whether it's necessary. */
+	/* Note: This WILL break if batch->verts[0] is destroyed and reallocated
+	 * at the same address. Bindings/VAOs would remain obsolete. */
+	//if (shgroup->instancing_geom->inst != batch->verts[0])
+	GWN_batch_instbuf_set(shgroup->instancing_geom, batch->verts[0], false);
 
 #ifdef USE_GPU_SELECT
 	DRWCall *call = BLI_mempool_alloc(DST.vmempool->calls);
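
With the stricter asserts above, an external instancing batch now requires the shgroup to have been created with a (possibly dummy) format so instancing_geom already exists; sketch (names hypothetical):

    DRWShadingGroup *grp = DRW_shgroup_instance_create(shader, pass, geom, format);
    DRW_shgroup_instance_batch(grp, external_batch); // swaps in external per-instance data

This is also why object_mode.c below gains a dummy one-float particle_format instead of passing NULL.
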
@@ -1140,8 +1147,6 @@ void DRW_shgroup_set_instance_count(DRWShadingGroup *shgroup, unsigned int count
 
 unsigned int DRW_shgroup_get_instance_count(const DRWShadingGroup *shgroup)
 {
-	BLI_assert(shgroup->type != DRW_SHG_NORMAL && shgroup->type != DRW_SHG_INSTANCE_EXTERNAL);
-
 	return shgroup->interface.instance_count;
 }
 
@@ -1765,18 +1770,17 @@ static void draw_geometry_execute_ex(
 	if (geom == NULL) {
 		BLI_assert(shgroup->type == DRW_SHG_TRIANGLE_BATCH); /* Add other type if needed. */
 		/* Shader is already bound. */
-		Gwn_Batch *batch = DRW_cache_fullscreen_quad_get();
-		GWN_batch_draw_procedural(batch, GWN_PRIM_TRIS, count);
+		GWN_draw_primitive(GWN_PRIM_TRIS, count);
 		return;
 	}
 
 	/* step 2 : bind vertex array & draw */
 	GWN_batch_program_set(geom, GPU_shader_get_program(shgroup->shader), GPU_shader_get_interface(shgroup->shader));
 	if (ELEM(shgroup->type, DRW_SHG_INSTANCE, DRW_SHG_INSTANCE_EXTERNAL)) {
-		GWN_batch_draw_stupid_instanced(geom, shgroup->instancing_geom, start, count);
+		GWN_batch_draw_range_ex(geom, start, count, true);
 	}
 	else {
-		GWN_batch_draw_stupid(geom, start, count);
+		GWN_batch_draw_range(geom, start, count);
 	}
 	/* XXX this just tells gawain we are done with the shader.
 	 * This does not unbind the shader. */
@@ -1998,7 +2002,7 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
 		if (shgroup->type == DRW_SHG_INSTANCE_EXTERNAL) {
 			if (shgroup->instancing_geom != NULL) {
 				GPU_SELECT_LOAD_IF_PICKSEL((DRWCall *)shgroup->calls_first);
-				draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, 0, 0);
+				draw_geometry(shgroup, shgroup->instancing_geom, obmat, shgroup->instance_data, 0, 0);
 			}
 		}
 		else {
 			unsigned int count, start;
 			GPU_SELECT_LOAD_IF_PICKSEL_LIST(shgroup, start, count)
 			{
-				draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, start, count);
+				draw_geometry(shgroup,
+				              (shgroup->instancing_geom) ? shgroup->instancing_geom : shgroup->instance_geom,
+				              obmat, shgroup->instance_data, start, count);
 			}
 			GPU_SELECT_LOAD_IF_PICKSEL_LIST_END(start, count)
 		}
 	}
-	else {
+	else { /* DRW_SHG_***_BATCH */
 		/* Some dynamic batch can have no geom (no call to aggregate) */
 		if (shgroup->interface.instance_count > 0) {
 			unsigned int count, start;
diff --git a/source/blender/draw/modes/object_mode.c b/source/blender/draw/modes/object_mode.c
index 4a7a5d25b11..d6c0369b0a5 100644
--- a/source/blender/draw/modes/object_mode.c
+++ b/source/blender/draw/modes/object_mode.c
@@ -218,6 +218,7 @@ typedef struct OBJECT_PrivateData {
 
 static struct {
 	/* Instance Data format */
+	struct Gwn_VertFormat *particle_format;
 	struct Gwn_VertFormat *empty_image_format;
 	struct Gwn_VertFormat *empty_image_wire_format;
 
@@ -537,6 +538,7 @@ static void OBJECT_engine_init(void *vedata)
 
 static void OBJECT_engine_free(void)
 {
+	MEM_SAFE_FREE(e_data.particle_format);
 	MEM_SAFE_FREE(e_data.empty_image_format);
 	MEM_SAFE_FREE(e_data.empty_image_wire_format);
 	DRW_SHADER_FREE_SAFE(e_data.outline_resolve_sh);
@@ -1752,6 +1754,9 @@ static void OBJECT_cache_populate_particles(Object *ob,
 	static float def_prim_col[3] = {0.5f, 0.5f, 0.5f};
 	static float def_sec_col[3] = {1.0f, 1.0f, 1.0f};
 
+	/* Dummy particle format for instancing to work. */
+	DRW_shgroup_instance_format(e_data.particle_format, {{"dummy", DRW_ATTRIB_FLOAT, 1}});
+
 	Material *ma = give_current_material(ob, part->omat);
 
 	switch (draw_as) {
@@ -1766,21 +1771,24 @@ static void OBJECT_cache_populate_particles(Object *ob,
 			break;
 		case PART_DRAW_CROSS:
 			shgrp = DRW_shgroup_instance_create(
-			        e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS), NULL);
+			        e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS),
+			        e_data.particle_format);
 			DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
 			DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
 			DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
 			break;
 		case PART_DRAW_CIRC:
 			shgrp = DRW_shgroup_instance_create(
-			        e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC), NULL);
+			        e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC),
+			        e_data.particle_format);
 			DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
 			DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
 			DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[1], 1);
 			break;
 		case PART_DRAW_AXIS:
 			shgrp = DRW_shgroup_instance_create(
-			        e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS), NULL);
+			        e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS),
+			        e_data.particle_format);
 			DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
 			break;
 		default: