| /* |
| * Copyright (c) 2014 Advanced Micro Devices, Inc. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| * THE SOFTWARE. |
| */ |
| |
| #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable |
| #define CLK_ENQUEUE_FAILURE -1 |
| |
| // XXX This was copied from runtime/device/gpu/gpuschedcl.cpp |
| |
| //! AmdAqlWrap slot state |
| enum AqlWrapState { |
| AQL_WRAP_FREE = 0, |
| AQL_WRAP_RESERVED, |
| AQL_WRAP_READY, |
| AQL_WRAP_MARKER, |
| AQL_WRAP_BUSY, |
| AQL_WRAP_DONE |
| }; |
| |
| //! Profiling states |
| enum ProfilingState { |
| PROFILING_COMMAND_START = 0, |
| PROFILING_COMMAND_END, |
| PROFILING_COMMAND_COMPLETE |
| }; |
| |
| //! OCL dispatch condition flags |
| // --- this is unused in the library and I've asked German to remove |
| // it in favor of the clang enum |
| enum ClFlags { |
| NO_WAIT = 0, |
| WAIT_PARENT, |
| WAIT_WORK_GROUP |
| }; |
| |
| typedef struct _HsaAqlDispatchPacket { |
| uint mix; |
| ushort workgroup_size[3]; |
| ushort reserved2; |
| uint grid_size[3]; |
| uint private_segment_size_bytes; |
| uint group_segment_size_bytes; |
| ulong kernel_object_address; |
| ulong kernel_arg_address; |
| ulong reserved3; |
| ulong completion_signal; |
| } HsaAqlDispatchPacket; |
| |
| typedef struct _AmdVQueueHeader { |
| uint aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). |
| uint event_slot_num; //!< [LRO] The number of kernel events in the events buffer |
| ulong event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events |
| ulong event_slots; //!< [LRO] Pointer to a buffer for the events. |
| // Array of event_slot_num entries of AmdEvent |
| ulong aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots |
| uint command_counter; //!< [LRW] The global counter for the submitted commands into the queue |
| uint wait_size; //!< [LRO] The wait list size (in clk_event_t) |
| uint arg_size; //!< [LRO] The size of argument buffer (in bytes) |
| uint reserved0; //!< For the future usage |
| ulong kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) |
| uint reserved[2]; //!< For the future usage |
| } AmdVQueueHeader; |
| |
| typedef struct _AmdAqlWrap { |
| uint state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, |
| // MARKER, BUSY and DONE. The block could be returned back to a free state. |
| uint enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start - |
| // (KERNEL_ENQUEUE_FLAGS_T) |
| // NO_WAIT - we just start processing |
| // WAIT_PARENT - check if parent_wrap->state is done and then start processing |
| // WAIT_WORK_GROUP currently == WAIT_PARENT |
| uint command_id; //!< [LWO/SRO] The unique command ID |
| uint child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. |
| // It's incremented on the |
| // start and decremented on the finish. The parent kernel can be considered as |
| // done when the value is 0 and the state is DONE |
| ulong completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) |
| ulong parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) |
| ulong wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) |
| uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects |
| uint reserved[5]; //!< For the future usage |
| HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet - 64 bytes AQL packet |
| } AmdAqlWrap; |
| |
| typedef struct _AmdEvent { |
| uint state; //!< [LRO/SRW] Event state: START, END, COMPLETE |
| uint counter; //!< [LRW] Event retain/release counter. 0 means the event is free |
| ulong timer[3]; //!< [LRO/SWO] Timer values for profiling for each state |
| } AmdEvent; |
| |
| // XXX This is adapted from hsa.h |
| |
| // This is an OpenCLized hsa_control_directives_t |
| typedef struct _HsaControlDirectives { |
| ulong enabled_control_directives; |
| ushort enable_break_exceptions; |
| ushort enable_detect_exceptions; |
| uint max_dynamic_group_size; |
| uint max_flat_grid_size; |
| uint max_flat_workgroup_size; |
| uint requested_workgroups_per_cu; |
| uint required_grid_size[3]; |
| uint required_workgroup_size[3]; |
| uchar required_dim; |
| uchar reserved[75]; |
| } HsaControlDirectives; |
| |
| // This is an OpenCLized amd_kernel_code_t |
| typedef struct _AmdKernelCode { |
| uint amd_code_version_major; |
| uint amd_code_version_minor; |
| uint struct_byte_size; |
| uint target_chip; |
| ulong kernel_code_entry_byte_offset; |
| ulong kernel_code_prefetch_byte_offset; |
| ulong kernel_code_prefetch_byte_size; |
| ulong max_scratch_backing_memory_byte_size; |
| ulong compute_pgm_resource_registers; |
| uint enables_and_flags; |
| uint gds_segment_byte_size; |
| ushort debug_wavefront_private_segment_offset_sgpr; |
| ushort debug_private_segment_buffer_sgpr; |
| ushort wavefront_sgpr_count; |
| ushort workitem_vgpr_count; |
| ulong kernarg_segment_byte_size; |
| uint workitem_private_segment_byte_size; |
| uint workgroup_group_segment_byte_size; |
| uint workgroup_fbarrier_count; |
| uchar kernarg_segment_alignment; |
| uchar group_segment_alignment; |
| uchar private_segment_alignment; |
| uchar code_alignment; |
| uint code_type; |
| uint code_properties; |
| uchar wavefront_size; |
| uchar optimization_level; |
| uchar hsail_profile; |
| uchar hsail_machine_model; |
| uint hsail_version_major; |
| uint hsail_version_minor; |
| ushort hsail_target_options; |
| ushort reserved3; |
| HsaControlDirectives control_directive; |
| } AmdKernelCode; |
| |
| |
| // Library only from here |
| |
| // XXX this needs to match workgroup/wg.h MAX_WAVES_PER_SIMD |
| #define CL_DEVICE_MAX_WORK_GROUP_SIZE 256 |
| |
| // ABI has 6 special leading arguments: |
| // global_offset[3], printf_buf, default vqueue pointer, and self AqlWrap pointer |
| #define NUM_SPECIAL_ARGS 6 |
| extern __attribute__((const)) uint __hsail_ld_kernarg_u32(uint); |
| extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint); |
| |
| static inline __global AmdVQueueHeader * |
| get_vqueue(void) |
| { |
| size_t vq; |
| |
| if (sizeof(size_t) == 4) |
| vq = __hsail_ld_kernarg_u32(4*4); |
| else |
| vq = __hsail_ld_kernarg_u64(4*8); |
| |
| return (__global AmdVQueueHeader *)vq; |
| } |
| |
| static inline __global AmdAqlWrap * |
| get_aql_wrap(void) |
| { |
| size_t aw; |
| |
| if (sizeof(size_t) == 4) |
| aw = __hsail_ld_kernarg_u32(5*4); |
| else |
| aw = __hsail_ld_kernarg_u64(5*8); |
| |
| return (__global AmdAqlWrap *)aw; |
| } |
| |
| static inline __global void * |
| get_printf_ptr(void) |
| { |
| size_t pb; |
| |
| if (sizeof(size_t) == 4) |
| pb = __hsail_ld_kernarg_u32(3*4); |
| else |
| pb = __hsail_ld_kernarg_u64(3*8); |
| |
| return (__global void *)pb; |
| } |
| |
| typedef struct _NdRange { |
| uint dim; |
| size_t goff[3]; |
| size_t gws[3]; |
| size_t lws[3]; |
| } NdRange; |
| |
| // reserve a slot in a bitmask controlled resource |
| // n is the number of slots |
| static inline int |
| reserve_slot(__global uint * restrict mask, uint n) |
| { |
| n >>= 5; |
| uint i, j, k, v, vv, z; |
| |
| /* Spread the starting points */ |
| i = get_sub_group_local_id() % n; |
| |
| /* Allow only one pass */ |
| for (j=0,k=i;j<n;++j) { |
| __global atomic_uint *p = (__global atomic_uint *)(mask + k); |
| v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device); |
| for (;;) { |
| z = ctz(~v); |
| if (z == 32U) |
| break; |
| vv = v | (1U << z); |
| if (atomic_compare_exchange_strong_explicit(p, &v, vv, memory_order_acq_rel, memory_order_acquire, memory_scope_device)) |
| break; |
| } |
| if (z < 32U) |
| break; |
| k = k == n-1 ? 0 : k+1; |
| } |
| |
| k = (k << 5) + z; |
| return z < 32U ? (int)k : -1; |
| } |
| |
| // release slot in a bitmask controlled resource |
| // i is the slot number |
| static inline void |
| release_slot(__global uint * restrict mask, uint i) |
| { |
| /* uint b = ~(1UL << (i & 0x1f)); */ |
| uint b = ~amd_bfm(1U, i); |
| __global atomic_uint *p = (__global atomic_uint *)(mask + (i >> 5)); |
| uint v, vv; |
| |
| v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device); |
| for (;;) { |
| vv = v & b; |
| if (atomic_compare_exchange_strong_explicit(p, &v, vv, memory_order_acq_rel, memory_order_acquire, memory_scope_device)) |
| break; |
| } |
| } |
| |
| static inline uint |
| align_up(uint start, uint align) |
| { |
| return (start + align - 1U) & -align; |
| } |
| |