/*
* Copyright (c) 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#if __OPENCL_C_VERSION__ >= 200
#include "devenq.h"
static inline void
copy_waitlist(__global AmdEvent **dst, __global AmdEvent **src, uint n)
{
uint i;
for (i=0; i<n; ++i)
dst[i] = src[i];
}
#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) queue_t
get_default_queue(void)
{
return (queue_t)get_vqueue();
}
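// Illustrative use at the OpenCL C level (a minimal sketch, not part of this
// library): a kernel enqueues a child onto the default device queue, which is
// the virtual queue returned above.
//
//   kernel void parent(global int *out)
//   {
//       queue_t q = get_default_queue();
//       enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange_1D(64),
//                      ^{ out[get_global_id(0)] = 1; });
//   }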
#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) int
enqueue_marker(queue_t q, uint nwl, const clk_event_t *wl, clk_event_t *re)
{
__global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q;
if (nwl > vq->wait_size)
return CLK_ENQUEUE_FAILURE;
// Get a wrap slot
__global uint *amask = (__global uint *)vq->aql_slot_mask;
int ai = reserve_slot(amask, vq->aql_slot_num);
if (ai < 0)
return CLK_ENQUEUE_FAILURE;
// Get a return event slot
__global uint *emask = (__global uint *)vq->event_slot_mask;
int ei = reserve_slot(emask, vq->event_slot_num);
if (ei < 0) {
release_slot(amask, ai);
return CLK_ENQUEUE_FAILURE;
}
// Initialize return event
__global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + ei;
ev->state = CL_SUBMITTED;
ev->counter = 1;
ev->timer[0] = 0;
ev->timer[1] = 0;
ev->timer[2] = 0;
// Initialize wrap
__global AmdAqlWrap *me = get_aql_wrap();
__global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai;
aw->enqueue_flags = CLK_ENQUEUE_FLAGS_NO_WAIT;
aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device);
aw->child_counter = 0;
aw->completion = (ulong)ev;
aw->parent_wrap = (ulong)me;
if (nwl > 0)
copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl);
aw->wait_num = nwl;
// A marker is never dispatched as a kernel, so the AQL dispatch packet is left unused
// Tell the scheduler
atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device);
atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_MARKER, memory_order_release, memory_scope_device);
*re = (clk_event_t)ev;
return 0;
}
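// Illustrative use at the OpenCL C level (a minimal sketch; `child_ev` is
// assumed to come from an earlier device-side enqueue): the marker's return
// event completes once everything in its wait list completes.
//
//   clk_event_t marker_ev;
//   if (enqueue_marker(get_default_queue(), 1, &child_ev, &marker_ev) == CLK_SUCCESS) {
//       // ... chain further work after marker_ev ...
//       release_event(marker_ev);
//   }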
// int
// __enqueue_internal_{0,1,...,10}[_events] (
//     queue_t q,
//     int flags,
//     ndrange_t ndr,                                 /* dims, goff[3], gsize[3], lsize[3] */
//     [uint nwl, clk_event_t *wl, clk_event_t *re,]  /* _events variants only */
//     __global void * something_like_function_pointer,
//     __global void * wrap_ptr_from_prep
//     [, uint size0, uint align0
//     [, uint size1, uint align1
//     [, uint size2, uint align2
//     [, uint size3, uint align3
//     ...]]]] );
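//
// Illustrative sketch only (the exact lowering is produced by the compiler and
// is not spelled out here): a source-level call with one __local buffer, e.g.
//
//   enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndr,
//                  ^(local void *tmp){ /* ... */ },
//                  (uint)(64 * sizeof(float)));
//
// would be expected to reach __enqueue_internal_1(q, flags, ndr, fp, wrap,
// 64 * sizeof(float), /* alignment chosen by the compiler */ 4), where fp and
// wrap come from the earlier "prep" step mentioned above, and each
// (sizeN, alignN) pair describes one captured __local buffer.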
// Help with size and alignment handling
#define _SA_ARGS10 _SA_ARGS9, uint sz9, uint al9
#define _SA_ARGS9 _SA_ARGS8, uint sz8, uint al8
#define _SA_ARGS8 _SA_ARGS7, uint sz7, uint al7
#define _SA_ARGS7 _SA_ARGS6, uint sz6, uint al6
#define _SA_ARGS6 _SA_ARGS5, uint sz5, uint al5
#define _SA_ARGS5 _SA_ARGS4, uint sz4, uint al4
#define _SA_ARGS4 _SA_ARGS3, uint sz3, uint al3
#define _SA_ARGS3 _SA_ARGS2, uint sz2, uint al2
#define _SA_ARGS2 _SA_ARGS1, uint sz1, uint al1
#define _SA_ARGS1 _SA_ARGS0, uint sz0, uint al0
#define _SA_ARGS0
#define SA_ARGS(N) _SA_ARGS##N
#define _SET_KARG10 _SET_KARG9; lo = align_up(lo, al9); args[6+9] = lo; lo += sz9
#define _SET_KARG9 _SET_KARG8; lo = align_up(lo, al8); args[6+8] = lo; lo += sz8
#define _SET_KARG8 _SET_KARG7; lo = align_up(lo, al7); args[6+7] = lo; lo += sz7
#define _SET_KARG7 _SET_KARG6; lo = align_up(lo, al6); args[6+6] = lo; lo += sz6
#define _SET_KARG6 _SET_KARG5; lo = align_up(lo, al5); args[6+5] = lo; lo += sz5
#define _SET_KARG5 _SET_KARG4; lo = align_up(lo, al4); args[6+4] = lo; lo += sz4
#define _SET_KARG4 _SET_KARG3; lo = align_up(lo, al3); args[6+3] = lo; lo += sz3
#define _SET_KARG3 _SET_KARG2; lo = align_up(lo, al2); args[6+2] = lo; lo += sz2
#define _SET_KARG2 _SET_KARG1; lo = align_up(lo, al1); args[6+1] = lo; lo += sz1
#define _SET_KARG1 lo = align_up(lo, al0); args[6+0] = lo; lo += sz0
#define _SET_KARG0
#define SET_KARG(N) _SET_KARG##N
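// For reference, the N = 2 case of these helpers expands to:
//
//   SA_ARGS(2)  ->  , uint sz0, uint al0, uint sz1, uint al1
//   SET_KARG(2) ->  lo = align_up(lo, al0); args[6+0] = lo; lo += sz0;
//                   lo = align_up(lo, al1); args[6+1] = lo; lo += sz1
//
// so each captured __local buffer is assigned an aligned offset in the group
// segment, immediately after the kernel's static group segment usage (lo0).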
#define GEN(N) \
__attribute__((always_inline)) \
int \
__enqueue_internal_##N(queue_t q, int flags, ndrange_t ndr_type, \
__global void *fp, __global void *aqlWrap SA_ARGS(N)) \
{ \
__global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \
__global AmdAqlWrap *me = get_aql_wrap(); \
__global uint *amask = (__global uint *)vq->aql_slot_mask; \
__global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \
int ai = aw - (__global AmdAqlWrap *)(vq + 1); \
__private NdRange *ndr = (__private NdRange *) &ndr_type; \
\
/* Skip check of dim for now */ \
if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \
CL_DEVICE_MAX_WORK_GROUP_SIZE) { \
release_slot(amask, ai); \
return CLK_ENQUEUE_FAILURE; \
} \
\
/* This is the current index-based approach, not the ldk based approach */ \
__global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \
uint ki = (uint)fp; \
__global AmdKernelCode *kc = kt[ki]; \
\
aw->enqueue_flags = flags; \
\
aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
aw->child_counter = 0; \
aw->completion = 0; \
aw->parent_wrap = (ulong)me; \
aw->wait_num = 0; \
\
aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) | (0x0 << 8) | (0x2 << 0); \
aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \
aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \
aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \
aw->aql.grid_size[0] = (uint)ndr->gws[0]; \
aw->aql.grid_size[1] = (uint)ndr->gws[1]; \
aw->aql.grid_size[2] = (uint)ndr->gws[2]; \
aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \
aw->aql.group_segment_size_bytes = 0; \
aw->aql.kernel_object_address = (ulong)kc; \
aw->aql.completion_signal = 0; \
\
/* Set non-capture arguments */ \
__global size_t *args = (__global size_t *)aw->aql.kernel_arg_address; \
args[0] = ndr->goff[0]; \
args[1] = ndr->goff[1]; \
args[2] = ndr->goff[2]; \
args[3] = (size_t)get_printf_ptr(); \
args[4] = (size_t)vq; \
args[5] = (size_t)aw; \
\
uint lo0 = kc->workgroup_group_segment_byte_size; \
uint lo = lo0; \
SET_KARG(N); \
aw->aql.group_segment_size_bytes = lo - lo0; \
\
/* Tell the scheduler */ \
atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); \
return 0; \
}
GEN(0)
GEN(1)
GEN(2)
GEN(3)
GEN(4)
GEN(5)
GEN(6)
GEN(7)
GEN(8)
GEN(9)
GEN(10)
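// The dispatchers above populate a fixed kernel-argument prefix for the
// enqueued block's wrapper; as written in GEN(), the layout is:
//
//   args[0..2]  global offset (x, y, z)
//   args[3]     printf buffer pointer
//   args[4]     virtual queue header (AmdVQueueHeader *)
//   args[5]     this command's AmdAqlWrap *
//   args[6+i]   group segment offset of the i-th captured __local buffer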
// Now the versions with events
#define EGEN(N) \
__attribute__((always_inline)) \
int \
__enqueue_internal_##N##_events(queue_t q, int flags, ndrange_t ndr_type, \
uint nwl, clk_event_t *wl, clk_event_t *re, \
__global void *fp, __global void *aqlWrap SA_ARGS(N)) \
{ \
__global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \
__global uint *amask = (__global uint *)vq->aql_slot_mask; \
__global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \
int ai = aw - (__global AmdAqlWrap *)(vq + 1); \
__private NdRange *ndr = (__private NdRange *) &ndr_type; \
\
/* Skip check of dim for now */ \
if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \
CL_DEVICE_MAX_WORK_GROUP_SIZE || nwl > vq->wait_size) { \
release_slot(amask, ai); \
return CLK_ENQUEUE_FAILURE; \
} \
\
__global AmdAqlWrap *me = get_aql_wrap(); \
__global AmdEvent *ev = NULL; \
\
if (re != NULL) { \
/* Get a return event slot */ \
__global uint *emask = (__global uint *)vq->event_slot_mask; \
int ei = reserve_slot(emask, vq->event_slot_num); \
if (ei < 0) { \
release_slot(amask, ai); \
return CLK_ENQUEUE_FAILURE; \
} \
\
/* Initialize return event */ \
ev = (__global AmdEvent *)vq->event_slots + ei; \
ev->state = CL_SUBMITTED; \
ev->counter = 1; \
ev->timer[0] = 0; \
ev->timer[1] = 0; \
ev->timer[2] = 0; \
} \
\
/* This is the current index-based approach, not the ldk based approach */ \
__global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \
uint ki = (uint)fp; \
__global AmdKernelCode *kc = kt[ki]; \
\
aw->enqueue_flags = flags; \
\
aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
aw->child_counter = 0; \
aw->completion = (ulong)ev; \
aw->parent_wrap = (ulong)me; \
\
aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) | (0x0 << 8) | (0x2 << 0); \
aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \
aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \
aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \
aw->aql.grid_size[0] = (uint)ndr->gws[0]; \
aw->aql.grid_size[1] = (uint)ndr->gws[1]; \
aw->aql.grid_size[2] = (uint)ndr->gws[2]; \
aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \
aw->aql.group_segment_size_bytes = 0; \
aw->aql.kernel_object_address = (ulong)kc; \
aw->aql.completion_signal = 0; \
\
/* Set non-capture arguments */ \
__global size_t *args = (__global size_t *)aw->aql.kernel_arg_address; \
args[0] = ndr->goff[0]; \
args[1] = ndr->goff[1]; \
args[2] = ndr->goff[2]; \
args[3] = (size_t)get_printf_ptr(); \
args[4] = (size_t)vq; \
args[5] = (size_t)aw; \
\
uint lo0 = kc->workgroup_group_segment_byte_size; \
uint lo = lo0; \
SET_KARG(N); \
aw->aql.group_segment_size_bytes = lo - lo0; \
\
/* Copy wait list */ \
if (nwl > 0) \
copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl); \
\
aw->wait_num = nwl; \
\
/* Tell the scheduler */ \
atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); \
\
if (re != NULL) \
*re = (clk_event_t)ev; \
\
return 0; \
}
EGEN(0)
EGEN(1)
EGEN(2)
EGEN(3)
EGEN(4)
EGEN(5)
EGEN(6)
EGEN(7)
EGEN(8)
EGEN(9)
EGEN(10)
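// Illustrative use at the OpenCL C level (a minimal sketch; `dep` is assumed
// to come from an earlier device-side enqueue): the event form of
// enqueue_kernel is what reaches the __enqueue_internal_N_events dispatchers,
// with the wait list copied into the AQL wrap and `done` bound to the
// reserved return-event slot.
//
//   clk_event_t done;
//   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT,
//                  ndrange_1D(256),
//                  1, &dep, &done,
//                  ^{ /* child work */ });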
#endif