/*
* Copyright (c) 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#if __OPENCL_C_VERSION__ >= 200
#include "devenq.h"
static inline void
copy_waitlist(__global AmdEvent **dst, __global AmdEvent **src, uint n)
{
uint i;
for (i=0; i<n; ++i)
dst[i] = src[i];
}
#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) queue_t
get_default_queue(void)
{
return (queue_t)get_vqueue();
}
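// Illustrative use at the OpenCL C level (a minimal sketch, not part of this
// library): a kernel enqueues a child onto the default device queue, which is
// the virtual queue returned above.
//
//   kernel void parent(global int *out)
//   {
//       queue_t q = get_default_queue();
//       enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange_1D(64),
//                      ^{ out[get_global_id(0)] = 1; });
//   }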
#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) int
enqueue_marker(queue_t q, uint nwl, const clk_event_t *wl, clk_event_t *re)
{
__global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q;
if (nwl > vq->wait_size)
return CLK_ENQUEUE_FAILURE;
// Get a wrap slot
__global uint *amask = (__global uint *)vq->aql_slot_mask;
int ai = reserve_slot(amask, vq->aql_slot_num);
if (ai < 0)
return CLK_ENQUEUE_FAILURE;
// Get a return event slot
__global uint *emask = (__global uint *)vq->event_slot_mask;
int ei = reserve_slot(emask, vq->event_slot_num);
if (ei < 0) {
release_slot(amask, ai);
return CLK_ENQUEUE_FAILURE;
}
// Initialize return event
__global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + ei;
ev->state = CL_SUBMITTED;
ev->counter = 1;
ev->timer[0] = 0;
ev->timer[1] = 0;
ev->timer[2] = 0;
// Initialize wrap
__global AmdAqlWrap *me = get_aql_wrap();
__global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai;
aw->enqueue_flags = CLK_ENQUEUE_FLAGS_NO_WAIT;
aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device);
aw->child_counter = 0;
aw->completion = (ulong)ev;
aw->parent_wrap = (ulong)me;
if (nwl > 0)
copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl);
aw->wait_num = nwl;
// A marker is never dispatched as a kernel, so the AQL dispatch packet is left unused
// Tell the scheduler
atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device);
atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_MARKER, memory_order_release, memory_scope_device);
*re = (clk_event_t)ev;
return 0;
}
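// Illustrative use at the OpenCL C level (a minimal sketch; `child_ev` is
// assumed to come from an earlier device-side enqueue): the marker's return
// event completes once everything in its wait list completes.
//
//   clk_event_t marker_ev;
//   if (enqueue_marker(get_default_queue(), 1, &child_ev, &marker_ev) == CLK_SUCCESS) {
//       // ... chain further work after marker_ev ...
//       release_event(marker_ev);
//   }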
// int
// __enqueue_internal_{0,1,...,10}[_events] (
//     queue_t q,
//     int flags,
//     ndrange_t ndr,                                 /* dims, goff[3], gsize[3], lsize[3] */
//     [uint nwl, clk_event_t *wl, clk_event_t *re,]  /* _events variants only */
//     __global void * something_like_function_pointer,
//     __global void * wrap_ptr_from_prep
//     [, uint size0, uint align0
//     [, uint size1, uint align1
//     [, uint size2, uint align2
//     [, uint size3, uint align3
//     ...]]]] );
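//
// Illustrative sketch only (the exact lowering is produced by the compiler and
// is not spelled out here): a source-level call with one __local buffer, e.g.
//
//   enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndr,
//                  ^(local void *tmp){ /* ... */ },
//                  (uint)(64 * sizeof(float)));
//
// would be expected to reach __enqueue_internal_1(q, flags, ndr, fp, wrap,
// 64 * sizeof(float), /* alignment chosen by the compiler */ 4), where fp and
// wrap come from the earlier "prep" step mentioned above, and each
// (sizeN, alignN) pair describes one captured __local buffer.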
// Help with size and alignment handling
#define _SA_ARGS10 _SA_ARGS9, uint sz9, uint al9
#define _SA_ARGS9 _SA_ARGS8, uint sz8, uint al8
#define _SA_ARGS8 _SA_ARGS7, uint sz7, uint al7
#define _SA_ARGS7 _SA_ARGS6, uint sz6, uint al6
#define _SA_ARGS6 _SA_ARGS5, uint sz5, uint al5
#define _SA_ARGS5 _SA_ARGS4, uint sz4, uint al4
#define _SA_ARGS4 _SA_ARGS3, uint sz3, uint al3
#define _SA_ARGS3 _SA_ARGS2, uint sz2, uint al2
#define _SA_ARGS2 _SA_ARGS1, uint sz1, uint al1
#define _SA_ARGS1 _SA_ARGS0, uint sz0, uint al0
#define _SA_ARGS0
#define SA_ARGS(N) _SA_ARGS##N
#define _SET_KARG10 _SET_KARG9; lo = align_up(lo, al9); args[6+9] = lo; lo += sz9
#define _SET_KARG9 _SET_KARG8; lo = align_up(lo, al8); args[6+8] = lo; lo += sz8
#define _SET_KARG8 _SET_KARG7; lo = align_up(lo, al7); args[6+7] = lo; lo += sz7
#define _SET_KARG7 _SET_KARG6; lo = align_up(lo, al6); args[6+6] = lo; lo += sz6
#define _SET_KARG6 _SET_KARG5; lo = align_up(lo, al5); args[6+5] = lo; lo += sz5
#define _SET_KARG5 _SET_KARG4; lo = align_up(lo, al4); args[6+4] = lo; lo += sz4
#define _SET_KARG4 _SET_KARG3; lo = align_up(lo, al3); args[6+3] = lo; lo += sz3
#define _SET_KARG3 _SET_KARG2; lo = align_up(lo, al2); args[6+2] = lo; lo += sz2
#define _SET_KARG2 _SET_KARG1; lo = align_up(lo, al1); args[6+1] = lo; lo += sz1
#define _SET_KARG1 lo = align_up(lo, al0); args[6+0] = lo; lo += sz0
#define _SET_KARG0
#define SET_KARG(N) _SET_KARG##N
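// For reference, the N = 2 case of these helpers expands to:
//
//   SA_ARGS(2)  ->  , uint sz0, uint al0, uint sz1, uint al1
//   SET_KARG(2) ->  lo = align_up(lo, al0); args[6+0] = lo; lo += sz0;
//                   lo = align_up(lo, al1); args[6+1] = lo; lo += sz1
//
// so each captured __local buffer is assigned an aligned offset in the group
// segment, immediately after the kernel's static group segment usage (lo0).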
#define GEN(N) \
__attribute__((always_inline)) \
int \
__enqueue_internal_##N(queue_t q, int flags, ndrange_t ndr_type, \
__global void *fp, __global void *aqlWrap SA_ARGS(N)) \
{ \
__global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \
__global AmdAqlWrap *me = get_aql_wrap(); \
__global uint *amask = (__global uint *)vq->aql_slot_mask; \
__global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \
int ai = aw - (__global AmdAqlWrap *)(vq + 1); \
__private NdRange *ndr = (__private NdRange *) &ndr_type; \
\
/* Skip check of dim for now */ \
if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \
CL_DEVICE_MAX_WORK_GROUP_SIZE) { \
release_slot(amask, ai); \
return CLK_ENQUEUE_FAILURE; \
} \
\
/* This is the current index-based approach, not the ldk based approach */ \
__global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \
uint ki = (uint)fp; \
__global AmdKernelCode *kc = kt[ki]; \
\
aw->enqueue_flags = flags; \
\
aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
aw->child_counter = 0; \
aw->completion = 0; \
aw->parent_wrap = (ulong)me; \
aw->wait_num = 0; \
\
aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) | (0x0 << 8) | (0x2 << 0); \
aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \
aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \
aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \
aw->aql.grid_size[0] = (uint)ndr->gws[0]; \
aw->aql.grid_size[1] = (uint)ndr->gws[1]; \
aw->aql.grid_size[2] = (uint)ndr->gws[2]; \
aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \
aw->aql.group_segment_size_bytes = 0; \
aw->aql.kernel_object_address = (ulong)kc; \
aw->aql.completion_signal = 0; \
\
/* Set non-capture arguments */ \
__global size_t *args = (__global size_t *)aw->aql.kernel_arg_address; \
args[0] = ndr->goff[0]; \
args[1] = ndr->goff[1]; \
args[2] = ndr->goff[2]; \
args[3] = (size_t)get_printf_ptr(); \
args[4] = (size_t)vq; \
args[5] = (size_t)aw; \
\
uint lo0 = kc->workgroup_group_segment_byte_size; \
uint lo = lo0; \
SET_KARG(N); \
aw->aql.group_segment_size_bytes = lo - lo0; \
\
/* Tell the scheduler */ \
atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); \
return 0; \
}
GEN(0)
GEN(1)
GEN(2)
GEN(3)
GEN(4)
GEN(5)
GEN(6)
GEN(7)
GEN(8)
GEN(9)
GEN(10)
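// The dispatchers above populate a fixed kernel-argument prefix for the
// enqueued block's wrapper; as written in GEN(), the layout is:
//
//   args[0..2]  global offset (x, y, z)
//   args[3]     printf buffer pointer
//   args[4]     virtual queue header (AmdVQueueHeader *)
//   args[5]     this command's AmdAqlWrap *
//   args[6+i]   group segment offset of the i-th captured __local buffer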
// Now the versions with events
#define EGEN(N) \
__attribute__((always_inline)) \
int \
__enqueue_internal_##N##_events(queue_t q, int flags, ndrange_t ndr_type, \
uint nwl, clk_event_t *wl, clk_event_t *re, \
__global void *fp, __global void *aqlWrap SA_ARGS(N)) \
{ \
__global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \
__global uint *amask = (__global uint *)vq->aql_slot_mask; \
__global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \
int ai = aw - (__global AmdAqlWrap *)(vq + 1); \
__private NdRange *ndr = (__private NdRange *) &ndr_type; \
\
/* Skip check of dim for now */ \
if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \
CL_DEVICE_MAX_WORK_GROUP_SIZE || nwl > vq->wait_size) { \
release_slot(amask, ai); \
return CLK_ENQUEUE_FAILURE; \
} \
\
__global AmdAqlWrap *me = get_aql_wrap(); \
__global AmdEvent *ev = NULL; \
\
if (re != NULL) { \
/* Get a return event slot */ \
__global uint *emask = (__global uint *)vq->event_slot_mask; \
int ei = reserve_slot(emask, vq->event_slot_num); \
if (ei < 0) { \
release_slot(amask, ai); \
return CLK_ENQUEUE_FAILURE; \
} \
\
/* Initialize return event */ \
ev = (__global AmdEvent *)vq->event_slots + ei; \
ev->state = CL_SUBMITTED; \
ev->counter = 1; \
ev->timer[0] = 0; \
ev->timer[1] = 0; \
ev->timer[2] = 0; \
} \
\
/* This is the current index-based approach, not the ldk based approach */ \
__global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \
uint ki = (uint)fp; \
__global AmdKernelCode *kc = kt[ki]; \
\
aw->enqueue_flags = flags; \
\
aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
aw->child_counter = 0; \
aw->completion = (ulong)ev; \
aw->parent_wrap = (ulong)me; \
\
aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) | (0x0 << 8) | (0x2 << 0); \
aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \
aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \
aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \
aw->aql.grid_size[0] = (uint)ndr->gws[0]; \
aw->aql.grid_size[1] = (uint)ndr->gws[1]; \
aw->aql.grid_size[2] = (uint)ndr->gws[2]; \
aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \
aw->aql.group_segment_size_bytes = 0; \
aw->aql.kernel_object_address = (ulong)kc; \
aw->aql.completion_signal = 0; \
\
/* Set non-capture arguments */ \
__global size_t *args = (__global size_t *)aw->aql.kernel_arg_address; \
args[0] = ndr->goff[0]; \
args[1] = ndr->goff[1]; \
args[2] = ndr->goff[2]; \
args[3] = (size_t)get_printf_ptr(); \
args[4] = (size_t)vq; \
args[5] = (size_t)aw; \
\
uint lo0 = kc->workgroup_group_segment_byte_size; \
uint lo = lo0; \
SET_KARG(N); \
aw->aql.group_segment_size_bytes = lo - lo0; \
\
/* Copy wait list */ \
if (nwl > 0) \
copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl); \
\
aw->wait_num = nwl; \
\
/* Tell the scheduler */ \
atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); \
\
if (re != NULL) \
*re = (clk_event_t)ev; \
\
return 0; \
}
EGEN(0)
EGEN(1)
EGEN(2)
EGEN(3)
EGEN(4)
EGEN(5)
EGEN(6)
EGEN(7)
EGEN(8)
EGEN(9)
EGEN(10)
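// Illustrative use at the OpenCL C level (a minimal sketch; `dep` is assumed
// to come from an earlier device-side enqueue): the event form of
// enqueue_kernel is what reaches the __enqueue_internal_N_events dispatchers,
// with the wait list copied into the AQL wrap and `done` bound to the
// reserved return-event slot.
//
//   clk_event_t done;
//   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT,
//                  ndrange_1D(256),
//                  1, &dep, &done,
//                  ^{ /* child work */ });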
#endif