Add AMD OpenCL builtins git-svn-id: https://llvm.org/svn/llvm-project/libclc/branches/amd-builtins@219217 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/amd-builtins/README b/amd-builtins/README new file mode 100644 index 0000000..630419a --- /dev/null +++ b/amd-builtins/README
@@ -0,0 +1,9 @@ +This directory contains builtins from AMD's OpenCL builtin library. +There is an ongoing effort to port these builtins to libclc. If you +would like to port a function, review the libclc-dev@pcc.me.uk archives +to make sure no one is already working on it. If no one else has started +this port, then send an email to libclc-dev@pcc.me.uk with the subject + +Porting: builtin_name + +This way we don't have multiple people trying to port the same functions.
diff --git a/amd-builtins/conv/conversions.cl b/amd-builtins/conv/conversions.cl new file mode 100644 index 0000000..567ba70 --- /dev/null +++ b/amd-builtins/conv/conversions.cl
@@ -0,0 +1,211 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// #pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,Rnd) \ +extern __attribute__((pure)) \ +ToTy __cvt_##ToSuf##_##Rnd##_##FromSuf(FromTy); + +#define ConvIntrinPrototypeSet(FromTy,FromSuf,ToTy,ToSuf) \ + ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rte) \ + ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rtn) \ + ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rtp) \ + ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rtz) + +#define FloatToIntegerRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd) \ +__attribute__((always_inline)) ToTy \ +__convert_##ToTy##_##Rnd##_##FromSuf(FromTy x) { \ + return (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x); \ +} + +#define FloatToIntegerSatRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd,Min,Max) \ +__attribute__((always_inline)) ToTy \ +__convert_##ToTy##_sat_##Rnd##_##FromSuf(FromTy x) { \ + ToTy r; \ + if (sizeof(ToTy) >= sizeof(long)) { \ + r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x); \ + bool le = (x <= (FromTy)Min); \ + bool ge = (x >= (FromTy)Max); \ + r = le?((ToTy) Min):r; \ + r = ge?((ToTy) Max):r; \ + } else { \ + FromTy s = min(max(x, (FromTy)Min), (FromTy)Max); \ + r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(s); \ + } \ + return r; \ +} + +#define FloatToUnsignedSatRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd,Max) \ +__attribute__((always_inline)) ToTy \ +__convert_##ToTy##_sat_##Rnd##_##FromSuf(FromTy x) { \ + ToTy r; \ + if (sizeof(ToTy) >= sizeof(long)) { \ + r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x); \ + bool le = (x <= (FromTy)0); \ + bool ge = (x >= (FromTy)Max); \ + r = le?((ToTy) 0):r; \ + r = ge?((ToTy) Max):r; \ + } else { \ + FromTy s = min(max(x, (FromTy)0), (FromTy)Max); \ + r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(s); \ + } \ + return r; \ +} + +#define AllFloatToIntegerRoundingConv(Ty,TySuf) \ + FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rte) \ + FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rtn) \ + FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rtp) 
\ + FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rtz) + +#define AllFloatToIntegerSatRoundingConv(Ty,TySuf,Min,Max) \ + FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rte,Min,Max) \ + FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rtn,Min,Max) \ + FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rtp,Min,Max) \ + FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rtz,Min,Max) + +#define AllFloatToUnsignedSatRoundingConv(Ty,TySuf,Max) \ + FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rte,Max) \ + FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rtn,Max) \ + FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rtp,Max) \ + FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rtz,Max) + +#define AllDoubleToIntegerRoundingConv(Ty,TySuf) \ + FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rte) \ + FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rtn) \ + FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rtp) \ + FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rtz) + +#define AllDoubleToIntegerSatRoundingConv(Ty,TySuf,Min,Max) \ + FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rte,Min,Max) \ + FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rtn,Min,Max) \ + FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rtp,Min,Max) \ + FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rtz,Min,Max) + +#define AllDoubleToUnsignedSatRoundingConv(Ty,TySuf,Max) \ + FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rte,Max) \ + FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rtn,Max) \ + FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rtp,Max) \ + FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rtz,Max) + +#define FloatingPointRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd) \ +__attribute__((always_inline)) ToTy \ +__convert_##ToTy##_##Rnd##_##FromSuf(FromTy x) { \ + return (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x); \ +} + +#define AllIntegerToFloatRoundingConv(Ty,TySuf) \ + FloatingPointRoundingConv(Ty,TySuf,float,f32,rtn) \ + FloatingPointRoundingConv(Ty,TySuf,float,f32,rtp) \ + 
FloatingPointRoundingConv(Ty,TySuf,float,f32,rtz) + +#define AllIntegerToDoubleRoundingConv(Ty,TySuf) \ + FloatingPointRoundingConv(Ty,TySuf,double,f64,rtn) \ + FloatingPointRoundingConv(Ty,TySuf,double,f64,rtp) \ + FloatingPointRoundingConv(Ty,TySuf,double,f64,rtz) + +// Prototypes for conversion intrinsics + +// float to integer conversion intrinsics +ConvIntrinPrototypeSet(float,f32,int,s32) +ConvIntrinPrototypeSet(float,f32,uint,u32) +ConvIntrinPrototypeSet(float,f32,long,s64) +ConvIntrinPrototypeSet(float,f32,ulong,u64) + +// double to integer conversion intrinsics +ConvIntrinPrototypeSet(double,f64,int,s32) +ConvIntrinPrototypeSet(double,f64,uint,u32) +ConvIntrinPrototypeSet(double,f64,long,s64) +ConvIntrinPrototypeSet(double,f64,ulong,u64) + +// integer to float conversion intrinsics +ConvIntrinPrototypeSet(int,i32,float,f32) +ConvIntrinPrototypeSet(uint,u32,float,f32) +ConvIntrinPrototypeSet(long,i64,float,f32) +ConvIntrinPrototypeSet(ulong,u64,float,f32) + +// long to double conversion intrinsics +ConvIntrinPrototypeSet(long,i64,double,f64) +ConvIntrinPrototypeSet(ulong,u64,double,f64) + +// double to float conversion intrinsics +ConvIntrinPrototypeSet(double,f64,float,f32) + +// Defintions for conversion functions + +// float to integer conversions +AllFloatToIntegerRoundingConv(char,s32) +AllFloatToIntegerRoundingConv(short,s32) +AllFloatToIntegerRoundingConv(int,s32) +AllFloatToIntegerRoundingConv(long,s64) + +AllFloatToIntegerRoundingConv(uchar,u32) +AllFloatToIntegerRoundingConv(ushort,u32) +AllFloatToIntegerRoundingConv(uint,u32) +AllFloatToIntegerRoundingConv(ulong,u64) + +AllFloatToIntegerSatRoundingConv(char,s32,CHAR_MIN,CHAR_MAX) +AllFloatToIntegerSatRoundingConv(short,s32,SHRT_MIN,SHRT_MAX) +AllFloatToIntegerSatRoundingConv(int,s32,INT_MIN,INT_MAX) +AllFloatToIntegerSatRoundingConv(long,s64,LONG_MIN,LONG_MAX) + +AllFloatToUnsignedSatRoundingConv(uchar,u32,UCHAR_MAX) +AllFloatToUnsignedSatRoundingConv(ushort,u32,USHRT_MAX) 
+AllFloatToUnsignedSatRoundingConv(uint,u32,UINT_MAX) +AllFloatToUnsignedSatRoundingConv(ulong,u64,ULONG_MAX) + +// double to integer conversions +AllDoubleToIntegerRoundingConv(char,s32) +AllDoubleToIntegerRoundingConv(short,s32) +AllDoubleToIntegerRoundingConv(int,s32) +AllDoubleToIntegerRoundingConv(long,s64) + +AllDoubleToIntegerRoundingConv(uchar,u32) +AllDoubleToIntegerRoundingConv(ushort,u32) +AllDoubleToIntegerRoundingConv(uint,u32) +AllDoubleToIntegerRoundingConv(ulong,u64) + +AllDoubleToIntegerSatRoundingConv(char,s32,CHAR_MIN,CHAR_MAX) +AllDoubleToIntegerSatRoundingConv(short,s32,SHRT_MIN,SHRT_MAX) +AllDoubleToIntegerSatRoundingConv(int,s32,INT_MIN,INT_MAX) +AllDoubleToIntegerSatRoundingConv(long,s64,LONG_MIN,LONG_MAX) + +AllDoubleToUnsignedSatRoundingConv(uchar,u32,UCHAR_MAX) +AllDoubleToUnsignedSatRoundingConv(ushort,u32,USHRT_MAX) +AllDoubleToUnsignedSatRoundingConv(uint,u32,UINT_MAX) +AllDoubleToUnsignedSatRoundingConv(ulong,u64,ULONG_MAX) + +// integer to float +AllIntegerToFloatRoundingConv(int,i32) +AllIntegerToFloatRoundingConv(uint,u32) +AllIntegerToFloatRoundingConv(long,i64) +AllIntegerToFloatRoundingConv(ulong,u64) + +// long/ulong to double +AllIntegerToDoubleRoundingConv(long,i64) +AllIntegerToDoubleRoundingConv(ulong,u64) + +// double to float conversions +FloatingPointRoundingConv(double,f64,float,f32,rtn) +FloatingPointRoundingConv(double,f64,float,f32,rtp) +FloatingPointRoundingConv(double,f64,float,f32,rtz)
diff --git a/amd-builtins/devenq/devenq.h b/amd-builtins/devenq/devenq.h new file mode 100644 index 0000000..7232805 --- /dev/null +++ b/amd-builtins/devenq/devenq.h
@@ -0,0 +1,277 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable +#define CLK_ENQUEUE_FAILURE -1 + +// XXX This was copied from runtime/device/gpu/gpuschedcl.cpp + +//! AmdAqlWrap slot state +enum AqlWrapState { + AQL_WRAP_FREE = 0, + AQL_WRAP_RESERVED, + AQL_WRAP_READY, + AQL_WRAP_MARKER, + AQL_WRAP_BUSY, + AQL_WRAP_DONE +}; + +//! Profiling states +enum ProfilingState { + PROFILING_COMMAND_START = 0, + PROFILING_COMMAND_END, + PROFILING_COMMAND_COMPLETE +}; + +//! 
OCL dispatch condition flags +// --- this is unused in the library and I've asked German to remove +// it in favor of the clang enum +enum ClFlags { + NO_WAIT = 0, + WAIT_PARENT, + WAIT_WORK_GROUP +}; + +typedef struct _HsaAqlDispatchPacket { + uint mix; + ushort workgroup_size[3]; + ushort reserved2; + uint grid_size[3]; + uint private_segment_size_bytes; + uint group_segment_size_bytes; + ulong kernel_object_address; + ulong kernel_arg_address; + ulong reserved3; + ulong completion_signal; +} HsaAqlDispatchPacket; + +typedef struct _AmdVQueueHeader { + uint aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). + uint event_slot_num; //!< [LRO] The number of kernel events in the events buffer + ulong event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events + ulong event_slots; //!< [LRO] Pointer to a buffer for the events. + // Array of event_slot_num entries of AmdEvent + ulong aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots + uint command_counter; //!< [LRW] The global counter for the submitted commands into the queue + uint wait_size; //!< [LRO] The wait list size (in clk_event_t) + uint arg_size; //!< [LRO] The size of argument buffer (in bytes) + uint reserved0; //!< For the future usage + ulong kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) + uint reserved[2]; //!< For the future usage +} AmdVQueueHeader; + +typedef struct _AmdAqlWrap { + uint state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, + // MARKER, BUSY and DONE. The block could be returned back to a free state. 
+ uint enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start - + // (KERNEL_ENQUEUE_FLAGS_T) + // NO_WAIT - we just start processing + // WAIT_PARENT - check if parent_wrap->state is done and then start processing + // WAIT_WORK_GROUP currently == WAIT_PARENT + uint command_id; //!< [LWO/SRO] The unique command ID + uint child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. + // It's incremented on the + // start and decremented on the finish. The parent kernel can be considered as + // done when the value is 0 and the state is DONE + ulong completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) + ulong parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) + ulong wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) + uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects + uint reserved[5]; //!< For the future usage + HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet - 64 bytes AQL packet +} AmdAqlWrap; + +typedef struct _AmdEvent { + uint state; //!< [LRO/SRW] Event state: START, END, COMPLETE + uint counter; //!< [LRW] Event retain/release counter. 
0 means the event is free + ulong timer[3]; //!< [LRO/SWO] Timer values for profiling for each state +} AmdEvent; + +// XXX This is adapted from hsa.h + +// This is an OpenCLized hsa_control_directives_t +typedef struct _HsaControlDirectives { + ulong enabled_control_directives; + ushort enable_break_exceptions; + ushort enable_detect_exceptions; + uint max_dynamic_group_size; + uint max_flat_grid_size; + uint max_flat_workgroup_size; + uint requested_workgroups_per_cu; + uint required_grid_size[3]; + uint required_workgroup_size[3]; + uchar required_dim; + uchar reserved[75]; +} HsaControlDirectives; + +// This is an OpenCLized amd_kernel_code_t +typedef struct _AmdKernelCode { + uint amd_code_version_major; + uint amd_code_version_minor; + uint struct_byte_size; + uint target_chip; + ulong kernel_code_entry_byte_offset; + ulong kernel_code_prefetch_byte_offset; + ulong kernel_code_prefetch_byte_size; + ulong max_scratch_backing_memory_byte_size; + ulong compute_pgm_resource_registers; + uint enables_and_flags; + uint gds_segment_byte_size; + ushort debug_wavefront_private_segment_offset_sgpr; + ushort debug_private_segment_buffer_sgpr; + ushort wavefront_sgpr_count; + ushort workitem_vgpr_count; + ulong kernarg_segment_byte_size; + uint workitem_private_segment_byte_size; + uint workgroup_group_segment_byte_size; + uint workgroup_fbarrier_count; + uchar kernarg_segment_alignment; + uchar group_segment_alignment; + uchar private_segment_alignment; + uchar code_alignment; + uint code_type; + uint code_properties; + uchar wavefront_size; + uchar optimization_level; + uchar hsail_profile; + uchar hsail_machine_model; + uint hsail_version_major; + uint hsail_version_minor; + ushort hsail_target_options; + ushort reserved3; + HsaControlDirectives control_directive; +} AmdKernelCode; + + +// Library only from here + +// XXX this needs to match workgroup/wg.h MAX_WAVES_PER_SIMD +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 256 + +// ABI has 6 special leading arguments: +// 
global_offset[3], printf_buf, default vqueue pointer, and self AqlWrap pointer +#define NUM_SPECIAL_ARGS 6 +extern __attribute__((const)) uint __hsail_ld_kernarg_u32(uint); +extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint); + +static inline __global AmdVQueueHeader * +get_vqueue(void) +{ + size_t vq; + + if (sizeof(size_t) == 4) + vq = __hsail_ld_kernarg_u32(4*4); + else + vq = __hsail_ld_kernarg_u64(4*8); + + return (__global AmdVQueueHeader *)vq; +} + +static inline __global AmdAqlWrap * +get_aql_wrap(void) +{ + size_t aw; + + if (sizeof(size_t) == 4) + aw = __hsail_ld_kernarg_u32(5*4); + else + aw = __hsail_ld_kernarg_u64(5*8); + + return (__global AmdAqlWrap *)aw; +} + +static inline __global void * +get_printf_ptr(void) +{ + size_t pb; + + if (sizeof(size_t) == 4) + pb = __hsail_ld_kernarg_u32(3*4); + else + pb = __hsail_ld_kernarg_u64(3*8); + + return (__global void *)pb; +} + +typedef struct _NdRange { + uint dim; + size_t goff[3]; + size_t gws[3]; + size_t lws[3]; +} NdRange; + +// reserve a slot in a bitmask controlled resource +// n is the number of slots +static inline int +reserve_slot(__global uint * restrict mask, uint n) +{ + n >>= 5; + uint i, j, k, v, vv, z; + + /* Spread the starting points */ + i = get_sub_group_local_id() % n; + + /* Allow only one pass */ + for (j=0,k=i;j<n;++j) { + __global atomic_uint *p = (__global atomic_uint *)(mask + k); + v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device); + for (;;) { + z = ctz(~v); + if (z == 32U) + break; + vv = v | (1U << z); + if (atomic_compare_exchange_strong_explicit(p, &v, vv, memory_order_acq_rel, memory_order_acquire, memory_scope_device)) + break; + } + if (z < 32U) + break; + k = k == n-1 ? 0 : k+1; + } + + k = (k << 5) + z; + return z < 32U ? 
(int)k : -1; +} + +// release slot in a bitmask controlled resource +// i is the slot number +static inline void +release_slot(__global uint * restrict mask, uint i) +{ + /* uint b = ~(1UL << (i & 0x1f)); */ + uint b = ~amd_bfm(1U, i); + __global atomic_uint *p = (__global atomic_uint *)(mask + (i >> 5)); + uint v, vv; + + v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device); + for (;;) { + vv = v & b; + if (atomic_compare_exchange_strong_explicit(p, &v, vv, memory_order_acq_rel, memory_order_acquire, memory_scope_device)) + break; + } +} + +static inline uint +align_up(uint start, uint align) +{ + return (start + align - 1U) & -align; +} +
diff --git a/amd-builtins/devenq/enqueue.cl b/amd-builtins/devenq/enqueue.cl new file mode 100644 index 0000000..c944874 --- /dev/null +++ b/amd-builtins/devenq/enqueue.cl
@@ -0,0 +1,330 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#if __OPENCL_C_VERSION__ >= 200 + +#include "devenq.h" + +static inline void +copy_waitlist(__global AmdEvent **dst, __global AmdEvent **src, uint n) +{ + uint i; + for (i=0; i<n; ++i) + dst[i] = src[i]; +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) queue_t +get_default_queue(void) +{ + return (queue_t)get_vqueue(); +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) int +enqueue_marker(queue_t q, uint nwl, const clk_event_t *wl, clk_event_t *re) +{ + __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; + if (nwl > vq->wait_size) + return CLK_ENQUEUE_FAILURE; + + // Get a wrap slot + __global uint *amask = (__global uint *)vq->aql_slot_mask; + int ai = reserve_slot(amask, vq->aql_slot_num); + if (ai < 0) + return CLK_ENQUEUE_FAILURE; + + // Get a return event slot + __global uint *emask = (__global uint *)vq->event_slot_mask; + int ei = reserve_slot(emask, vq->event_slot_num); + if (ei < 0) { + release_slot(amask, ai); + return CLK_ENQUEUE_FAILURE; + } + + // Initialize return event + __global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + ei; + ev->state = CL_SUBMITTED; + ev->counter = 1; + ev->timer[0] = 0; + ev->timer[1] = 0; + ev->timer[2] = 0; + + // Initialize wrap + __global AmdAqlWrap *me = get_aql_wrap(); + __global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai; + + aw->enqueue_flags = CLK_ENQUEUE_FLAGS_NO_WAIT; + aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); + aw->child_counter = 0; + aw->completion = (ulong)ev; + aw->parent_wrap = (ulong)me; + + if (nwl > 0) + copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl); + + aw->wait_num = nwl; + + // A marker is never enqueued so ignore displatch packet + + // Tell the scheduler + atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, 
memory_order_acq_rel, memory_scope_device); + atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_MARKER, memory_order_release, memory_scope_device); + + *re = (clk_event_t)ev; + return 0; +} + +// int +// __enqueue_internal_{0,1,.,10}[_events] ( +// queue_t q, +// int flags, +// int dims, size_t goff[3], size_t gsize[3], size_t lsize[3], +// __global void * something_like_function_pointer, +// __global void * wrap_ptr_from_prep +// [, uint size0, uint align0 +// [, uint size1, uint align1 +// [, uint size2, uint align2 +// [, uint size3, uint align3 +// ...]]]]]] ); + +// Help with size and alignment handling +#define _SA_ARGS10 _SA_ARGS9, uint sz9, uint al9 +#define _SA_ARGS9 _SA_ARGS8, uint sz8, uint al8 +#define _SA_ARGS8 _SA_ARGS7, uint sz7, uint al7 +#define _SA_ARGS7 _SA_ARGS6, uint sz6, uint al6 +#define _SA_ARGS6 _SA_ARGS5, uint sz5, uint al5 +#define _SA_ARGS5 _SA_ARGS4, uint sz4, uint al4 +#define _SA_ARGS4 _SA_ARGS3, uint sz3, uint al3 +#define _SA_ARGS3 _SA_ARGS2, uint sz2, uint al2 +#define _SA_ARGS2 _SA_ARGS1, uint sz1, uint al1 +#define _SA_ARGS1 _SA_ARGS0, uint sz0, uint al0 +#define _SA_ARGS0 + +#define SA_ARGS(N) _SA_ARGS##N + +#define _SET_KARG10 _SET_KARG9; lo = align_up(lo, al9); args[6+9] = lo; lo += sz9 +#define _SET_KARG9 _SET_KARG8; lo = align_up(lo, al8); args[6+8] = lo; lo += sz8 +#define _SET_KARG8 _SET_KARG7; lo = align_up(lo, al7); args[6+7] = lo; lo += sz7 +#define _SET_KARG7 _SET_KARG6; lo = align_up(lo, al6); args[6+6] = lo; lo += sz6 +#define _SET_KARG6 _SET_KARG5; lo = align_up(lo, al5); args[6+5] = lo; lo += sz5 +#define _SET_KARG5 _SET_KARG4; lo = align_up(lo, al4); args[6+4] = lo; lo += sz4 +#define _SET_KARG4 _SET_KARG3; lo = align_up(lo, al3); args[6+3] = lo; lo += sz3 +#define _SET_KARG3 _SET_KARG2; lo = align_up(lo, al2); args[6+2] = lo; lo += sz2 +#define _SET_KARG2 _SET_KARG1; lo = align_up(lo, al1); args[6+1] = lo; lo += sz1 +#define _SET_KARG1 lo = align_up(lo, al0); args[6+0] = lo; lo += sz0 +#define 
_SET_KARG0 + +#define SET_KARG(N) _SET_KARG##N + +#define GEN(N) \ +__attribute__((always_inline)) \ +int \ +__enqueue_internal_##N(queue_t q, int flags, ndrange_t ndr_type, \ + __global void *fp, __global void *aqlWrap SA_ARGS(N)) \ +{ \ + __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \ + __global AmdAqlWrap *me = get_aql_wrap(); \ + __global uint *amask = (__global uint *)vq->aql_slot_mask; \ + __global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \ + int ai = aw - (__global AmdAqlWrap *)(vq + 1); \ + __private NdRange *ndr = (__private NdRange *) &ndr_type; \ + \ + /* Skip check of dim for now */ \ + if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \ + CL_DEVICE_MAX_WORK_GROUP_SIZE) { \ + release_slot(amask, ai); \ + return CLK_ENQUEUE_FAILURE; \ + } \ + \ + /* This is the current index-based approach, not the ldk based approach */ \ + __global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \ + uint ki = (uint)fp; \ + __global AmdKernelCode *kc = kt[ki]; \ + \ + aw->enqueue_flags = flags; \ + \ + aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \ + aw->child_counter = 0; \ + aw->completion = 0; \ + aw->parent_wrap = (ulong)me; \ + aw->wait_num = 0; \ + \ + aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0); \ + aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \ + aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \ + aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \ + aw->aql.grid_size[0] = (uint)ndr->gws[0]; \ + aw->aql.grid_size[1] = (uint)ndr->gws[1]; \ + aw->aql.grid_size[2] = (uint)ndr->gws[2]; \ + aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \ + aw->aql.group_segment_size_bytes = 0; \ + aw->aql.kernel_object_address = (ulong)kc; \ + aw->aql.completion_signal = 0; \ + \ + /* Set non-capture arguments */ \ + __global size_t *args = 
(__global size_t *)aw->aql.kernel_arg_address; \ + args[0] = ndr->goff[0]; \ + args[1] = ndr->goff[1]; \ + args[2] = ndr->goff[2]; \ + args[3] = (size_t)get_printf_ptr(); \ + args[4] = (size_t)vq; \ + args[5] = (size_t)aw; \ + \ + uint lo0 = kc->workgroup_group_segment_byte_size; \ + uint lo = lo0; \ + SET_KARG(N); \ + aw->aql.group_segment_size_bytes = lo - lo0; \ + \ + /* Tell the scheduler */ \ + atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \ + atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); \ + return 0; \ +} + +GEN(0) +GEN(1) +GEN(2) +GEN(3) +GEN(4) +GEN(5) +GEN(6) +GEN(7) +GEN(8) +GEN(9) +GEN(10) + +// Now the versions with events + +#define EGEN(N) \ +__attribute__((always_inline)) \ +int \ +__enqueue_internal_##N##_events(queue_t q, int flags, ndrange_t ndr_type, \ + uint nwl, clk_event_t *wl, clk_event_t *re, \ + __global void *fp, __global void *aqlWrap SA_ARGS(N)) \ +{ \ + __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \ + __global uint *amask = (__global uint *)vq->aql_slot_mask; \ + __global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \ + int ai = aw - (__global AmdAqlWrap *)(vq + 1); \ + __private NdRange *ndr = (__private NdRange *) &ndr_type; \ + \ + /* Skip check of dim for now */ \ + if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \ + CL_DEVICE_MAX_WORK_GROUP_SIZE | nwl > vq->wait_size) { \ + release_slot(amask, ai); \ + return CLK_ENQUEUE_FAILURE; \ + } \ + \ + __global AmdAqlWrap *me = get_aql_wrap(); \ + __global AmdEvent *ev = NULL; \ + \ + if (re != NULL) { \ + /* Get a return event slot */ \ + __global uint *emask = (__global uint *)vq->event_slot_mask; \ + int ei = reserve_slot(emask, vq->event_slot_num); \ + if (ei < 0) { \ + release_slot(amask, ai); \ + return CLK_ENQUEUE_FAILURE; \ + } \ + \ + /* Initialize return event */ \ + ev = (__global 
AmdEvent *)vq->event_slots + ei; \ + ev->state = CL_SUBMITTED; \ + ev->counter = 1; \ + ev->timer[0] = 0; \ + ev->timer[1] = 0; \ + ev->timer[2] = 0; \ + } \ + \ + /* This is the current index-based approach, not the ldk based approach */ \ + __global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \ + uint ki = (uint)fp; \ + __global AmdKernelCode *kc = kt[ki]; \ + \ + aw->enqueue_flags = flags; \ + \ + aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \ + aw->child_counter = 0; \ + aw->completion = 0; \ + aw->parent_wrap = (ulong)me; \ + \ + aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0); \ + aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \ + aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \ + aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \ + aw->aql.grid_size[0] = (uint)ndr->gws[0]; \ + aw->aql.grid_size[1] = (uint)ndr->gws[1]; \ + aw->aql.grid_size[2] = (uint)ndr->gws[2]; \ + aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \ + aw->aql.group_segment_size_bytes = 0; \ + aw->aql.kernel_object_address = (ulong)kc; \ + aw->aql.completion_signal = 0; \ + \ + /* Set non-capture arguments */ \ + __global size_t *args = (__global size_t *)aw->aql.kernel_arg_address; \ + args[0] = ndr->goff[0]; \ + args[1] = ndr->goff[1]; \ + args[2] = ndr->goff[2]; \ + args[3] = (size_t)get_printf_ptr(); \ + args[4] = (size_t)vq; \ + args[5] = (size_t)aw; \ + \ + uint lo0 = kc->workgroup_group_segment_byte_size; \ + uint lo = lo0; \ + SET_KARG(N); \ + aw->aql.group_segment_size_bytes = lo - lo0; \ + \ + /* Copy wait list */ \ + if (nwl > 0) \ + copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl); \ + \ + aw->wait_num = nwl; \ + \ + /* Tell the scheduler */ \ + atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, 
memory_scope_device); \ + atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_MARKER, memory_order_release, memory_scope_device); \ + \ + if (re != NULL) \ + *re = (clk_event_t)ev; \ + \ + return 0; \ +} + +EGEN(0) +EGEN(1) +EGEN(2) +EGEN(3) +EGEN(4) +EGEN(5) +EGEN(6) +EGEN(7) +EGEN(8) +EGEN(9) +EGEN(10) + +#endif +
diff --git a/amd-builtins/devenq/eprep.cl b/amd-builtins/devenq/eprep.cl new file mode 100644 index 0000000..93a771d --- /dev/null +++ b/amd-builtins/devenq/eprep.cl
@@ -0,0 +1,128 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#if __OPENCL_C_VERSION__ >= 200 + +#include "devenq.h" + +static inline void +copy_captured_context(__global void *d, __private void *s, uint size, uint align) +{ + if (align == 2) { + __global short *d2 = (__global short *)d; + __private short *s2 = (__private short *)s; + uint i; + uint n = size / align; + + for (i=0; i<n; ++i) + d2[i] = s2[i]; + } else if (align == 4) { + __global int *d4 = (__global int *)d; + __private int *s4 = (__private int *)s; + uint i; + uint n = size / align; + + for (i=0; i<n; ++i) + d4[i] = s4[i]; + } else if (align == 8) { + __global long *d8 = (__global long *)d; + __private long *s8 = (__private long *)s; + uint i; + uint n = size / align; + + for (i=0; i<n; ++i) + d8[i] = s8[i]; + } else if (align == 16) { + __global long2 *d16 = (__global long2 *)d; + __private long2 *s16 = (__private long2 *)s; + uint i; + uint n = size / align; + + for (i=0; i<n; ++i) + d16[i] = s16[i]; + } else if (align == 32 || align == 64 || align == 128) { + __global long4 *d32 = (__global long4 *)d; + __private long4 *s32 = (__private long4 *)s; + uint i; + uint n = size / 32U; + + for (i=0; i<n; ++i) + d32[i] = s32[i]; + } else { + __global char *d1 = (__global char *)d; + __private char *s1 = (__private char *)s; + uint i; + uint n = size; + + for (i=0; i<n; ++i) + d1[i] = s1[i]; + } +} + +// enqueue_prep attempts to allocate an AqlWrap and copy the +// context into the kernarg area +// returns: +// 1: a int indicating the allocation is successful +// 2: a pointer to the wrap itself to be passed to the actual enqueue +// call +static int +eprep(queue_t q, uint lsize, uint csize, uint calign, __private void *cptr, __global void* private* private wretp) +{ + __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; + + lsize = align_up(lsize, calign); + if (lsize + csize > vq->arg_size) + return CLK_ENQUEUE_FAILURE; + + int s = reserve_slot((__global uint *)vq->aql_slot_mask, vq->aql_slot_num); + if (s < 0) + return CLK_ENQUEUE_FAILURE; + + 
__global AmdAqlWrap *a = (__global AmdAqlWrap *)(vq + 1); + __global void *kptr = (__global void *)((size_t)a[s].aql.kernel_arg_address + NUM_SPECIAL_ARGS*sizeof(size_t)); + copy_captured_context(kptr, cptr, csize, calign); + + *wretp = (__global void *)(a + s); + return CLK_SUCCESS; +} + +#define GEN(N) \ +__attribute__((always_inline)) int \ +__enqueue_prep_##N(queue_t q, size_t csize, uint calign, __private void *cptr, __global void* private* private wretp) \ +{ \ + return eprep(q, (uint)((N + NUM_SPECIAL_ARGS)*sizeof(size_t)), (uint)csize, calign, cptr, wretp); \ +} + +GEN(0) +GEN(1) +GEN(2) +GEN(3) +GEN(4) +GEN(5) +GEN(6) +GEN(7) +GEN(8) +GEN(9) +GEN(10) + +#endif +
diff --git a/amd-builtins/devenq/events.cl b/amd-builtins/devenq/events.cl new file mode 100644 index 0000000..b6d7f50 --- /dev/null +++ b/amd-builtins/devenq/events.cl
@@ -0,0 +1,110 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#if __OPENCL_C_VERSION__ >= 200 + +#include "devenq.h" + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) void +retain_event(clk_event_t e) +{ + __global AmdEvent *ev = (__global AmdEvent *)e; + atomic_fetch_add_explicit((__global atomic_uint *)&ev->counter, 1U, memory_order_acq_rel, memory_scope_device); +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) void +release_event(clk_event_t e) +{ + __global AmdEvent *ev = (__global AmdEvent *)e; + uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U, memory_order_acq_rel, memory_scope_device); + if (c == 1U) { + __global AmdVQueueHeader *vq = get_vqueue(); + __global uint *emask = (__global uint *)vq->event_slot_mask; + __global AmdEvent *eb = (__global AmdEvent *)vq->event_slots; + uint i = ev - eb; + release_slot(emask, i); + } +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) clk_event_t +create_user_event(void) +{ + __global AmdVQueueHeader *vq = get_vqueue(); + __global uint *emask = (__global uint *)vq->event_slot_mask; + int i = reserve_slot(emask, vq->event_slot_num); + + if (i >= 0) { + __global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + i; + ev->state = CL_SUBMITTED; + ev->counter = 1; + ev->timer[0] = 0; + ev->timer[1] = 0; + ev->timer[2] = 0; + return (clk_event_t)ev; + } else + return (clk_event_t)(__global void *)NULL; +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) bool +is_valid_event(clk_event_t e) +{ + return !((__global AmdEvent *)e == NULL); +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) void +set_user_event_status(clk_event_t e, int s) +{ + __global AmdEvent *ev = (__global AmdEvent *)e; + atomic_store_explicit((__global atomic_uint *)&ev->state, (uint)s, memory_order_release, memory_scope_device); +} + +#ifdef __clang__ 
+__attribute__((overloadable)) +#endif +__attribute__((always_inline)) void +capture_event_profiling_info(clk_event_t e, clk_profiling_info n, __global void *p) +{ + // Currently the second argument must be CLK_PROFILING_COMMAND_EXEC_TIME + __global AmdEvent *ev = (__global AmdEvent *)e; + __global ulong *t = (__global ulong *)ev->timer; + + ((__global ulong *)p)[0] = t[PROFILING_COMMAND_END] - t[PROFILING_COMMAND_START]; + ((__global ulong *)p)[1] = t[PROFILING_COMMAND_COMPLETE] - t[PROFILING_COMMAND_START]; +} + +#endif +
diff --git a/amd-builtins/devenq/getkern.cl b/amd-builtins/devenq/getkern.cl new file mode 100644 index 0000000..4dd7893 --- /dev/null +++ b/amd-builtins/devenq/getkern.cl
@@ -0,0 +1,46 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#if __OPENCL_C_VERSION__ >= 200 + +#include "devenq.h" + +// Currently we have no information about the block with +// which to make block specific decisions. Therefore these +// library calls corresponding to all of the possible +// get_kernel_* built in functions have no argument at all +// and return a reasonable constant + +__attribute__((always_inline)) uint +__get_kernel_work_group_size_internal(void) +{ + return (uint)CL_DEVICE_MAX_WORK_GROUP_SIZE; +} + +__attribute__((always_inline)) uint +__get_kernel_preferred_work_group_size_multiple_internal(void) +{ + return 64U; +} + +#endif +
diff --git a/amd-builtins/devenq/ndrange.cl b/amd-builtins/devenq/ndrange.cl new file mode 100644 index 0000000..68d0f19 --- /dev/null +++ b/amd-builtins/devenq/ndrange.cl
@@ -0,0 +1,196 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#if __OPENCL_C_VERSION__ >= 200 + +#include "devenq.h" + +// 1D variants + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_1D(size_t gws) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 1; + rp->goff[0] = 0; + rp->goff[1] = 0; + rp->goff[2] = 0; + rp->gws[0] = gws; + rp->gws[1] = 1; + rp->gws[2] = 1; + rp->lws[0] = min(gws, (size_t)64); + rp->lws[1] = 1; + rp->lws[2] = 1; + return ret; +} + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_1D(size_t gws, size_t lws) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 1; + rp->goff[0] = 0; + rp->goff[1] = 0; + rp->goff[2] = 0; + rp->gws[0] = gws; + rp->gws[1] = 1; + rp->gws[2] = 1; + rp->lws[0] = lws; + rp->lws[1] = 1; + rp->lws[2] = 1; + return ret; +} + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_1D(size_t goff, size_t gws, size_t lws) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 1; + rp->goff[0] = goff; + rp->goff[1] = 0; + rp->goff[2] = 0; + rp->gws[0] = gws; + rp->gws[1] = 1; + rp->gws[2] = 1; + rp->lws[0] = lws; + rp->lws[1] = 1; + rp->lws[2] = 1; + return ret; +} + +// 2D variants + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_2D(size_t gws[2]) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 2; + rp->goff[0] = 0; + rp->goff[1] = 0; + rp->goff[2] = 0; + rp->gws[0] = gws[0]; + rp->gws[1] = gws[1]; + rp->gws[2] = 1; + rp->lws[0] = min(gws[0], (size_t)8); + rp->lws[1] = min(gws[1], (size_t)8); + rp->lws[2] = 1; + return ret; +} + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_2D(size_t gws[2], size_t lws[2]) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 2; + rp->goff[0] = 0; + rp->goff[1] = 0; + rp->goff[2] = 0; + rp->gws[0] = gws[0]; + rp->gws[1] = gws[1]; + rp->gws[2] = 1; + rp->lws[0] = lws[0]; + rp->lws[1] = lws[1]; + rp->lws[2] 
= 1; + return ret; +} + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_2D(size_t goff[2], size_t gws[2], size_t lws[2]) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 2; + rp->goff[0] = goff[0]; + rp->goff[1] = goff[1]; + rp->goff[2] = 0; + rp->gws[0] = gws[0]; + rp->gws[1] = gws[1]; + rp->gws[2] = 1; + rp->lws[0] = lws[0]; + rp->lws[1] = lws[1]; + rp->lws[2] = 1; + return ret; +} + +// 3D variants + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_3D(size_t gws[3]) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 3; + rp->goff[0] = 0; + rp->goff[1] = 0; + rp->goff[2] = 0; + rp->gws[0] = gws[0]; + rp->gws[1] = gws[1]; + rp->gws[2] = gws[2]; + rp->lws[0] = min(gws[0], (size_t)4); + rp->lws[1] = min(gws[1], (size_t)4); + rp->lws[2] = min(gws[2], (size_t)4); + return ret; +} + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_3D(size_t gws[3], size_t lws[3]) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 3; + rp->goff[0] = 0; + rp->goff[1] = 0; + rp->goff[2] = 0; + rp->gws[0] = gws[0]; + rp->gws[1] = gws[1]; + rp->gws[2] = gws[2]; + rp->lws[0] = lws[0]; + rp->lws[1] = lws[1]; + rp->lws[2] = lws[2]; + return ret; +} + +__attribute__((overloadable, always_inline)) ndrange_t +ndrange_3D(size_t goff[3], size_t gws[3], size_t lws[3]) +{ + ndrange_t ret; + __private NdRange *rp = (__private NdRange *)&ret; + rp->dim = 3; + rp->goff[0] = goff[0]; + rp->goff[1] = goff[1]; + rp->goff[2] = goff[2]; + rp->gws[0] = gws[0]; + rp->gws[1] = gws[1]; + rp->gws[2] = gws[2]; + rp->lws[0] = lws[0]; + rp->lws[1] = lws[1]; + rp->lws[2] = lws[2]; + return ret; +} + +#endif +
diff --git a/amd-builtins/geom/clamp.cl b/amd-builtins/geom/clamp.cl new file mode 100644 index 0000000..6b90960 --- /dev/null +++ b/amd-builtins/geom/clamp.cl
@@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +__attribute__((overloadable,weak,always_inline)) float +clamp(float x, float minval, float maxval) +{ + return fmin(fmax(x, minval), maxval); +} + +__attribute__((overloadable,weak,always_inline)) double +clamp(double x, double minval, double maxval) +{ + // We think there is a bug in section 9.3.3 and match the float version instead + return fmin(fmax(x, minval), maxval); +} + +// Integer clamp functions + +#define ICLAMP(TY) \ +__attribute__((overloadable,weak,always_inline)) TY \ +clamp(TY x, TY minval, TY maxval) \ +{ \ + return min(max(x, minval), maxval); \ +} + +ICLAMP(char) +ICLAMP(uchar) + +ICLAMP(short) +ICLAMP(ushort) + +ICLAMP(int) +ICLAMP(uint) + +ICLAMP(long) +ICLAMP(ulong) +
diff --git a/amd-builtins/geom/cross.cl b/amd-builtins/geom/cross.cl new file mode 100644 index 0000000..4af3851 --- /dev/null +++ b/amd-builtins/geom/cross.cl
@@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +__attribute__((overloadable, weak,always_inline)) float3 +cross(float3 p0, float3 p1) +{ + return (float3)(p0.y * p1.z - p0.z * p1.y, + p0.z * p1.x - p0.x * p1.z, + p0.x * p1.y - p0.y * p1.x); +} + +__attribute__((overloadable, weak,always_inline)) double3 +cross(double3 p0, double3 p1) +{ + return (double3)(p0.y * p1.z - p0.z * p1.y, + p0.z * p1.x - p0.x * p1.z, + p0.x * p1.y - p0.y * p1.x); +} + +__attribute__((overloadable, weak,always_inline)) float4 +cross(float4 p0, float4 p1) +{ + return (float4)(p0.y * p1.z - p0.z * p1.y, + p0.z * p1.x - p0.x * p1.z, + p0.x * p1.y - p0.y * p1.x, + p0.w * p1.w - p0.w * p1.w); +} + +__attribute__((overloadable, weak,always_inline)) double4 +cross(double4 p0, double4 p1) +{ + return (double4)(p0.y * p1.z - p0.z * p1.y, + p0.z * p1.x - p0.x * p1.z, + p0.x * p1.y - p0.y * p1.x, + p0.w * p1.w - p0.w * p1.w); +} +
diff --git a/amd-builtins/geom/degrees.cl b/amd-builtins/geom/degrees.cl new file mode 100644 index 0000000..06ca875 --- /dev/null +++ b/amd-builtins/geom/degrees.cl
@@ -0,0 +1,52 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +__attribute__((overloadable, weak,always_inline)) float +degrees(float radians) +{ + // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F + return 0x1.ca5dc2p+5F * radians; +} + +__attribute__((overloadable, weak,always_inline)) double +degrees(double radians) +{ + // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F + return 0x1.ca5dc1a63c1f8p+5 * radians; +} + +//! Converts degrees to radians, i.e. (PI / 180) * degrees. 
+// +__attribute__((overloadable, weak,always_inline)) float +radians(float degrees) +{ + // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F + return 0x1.1df46ap-6F * degrees; +} + +__attribute__((overloadable, weak,always_inline)) double +radians(double degrees) +{ + // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F + return 0x1.1df46a2529d39p-6 * degrees; +} +
diff --git a/amd-builtins/geom/distance.cl b/amd-builtins/geom/distance.cl new file mode 100644 index 0000000..78147ac --- /dev/null +++ b/amd-builtins/geom/distance.cl
@@ -0,0 +1,70 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +__attribute__((overloadable, weak,always_inline)) float +distance(float p0, float p1) +{ + return length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) double +distance(double p0, double p1) +{ + return length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) float +distance(float2 p0, float2 p1) +{ + return length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) double +distance(double2 p0, double2 p1) +{ + return length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) float +distance(float3 p0, float3 p1) +{ + return length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) double +distance(double3 p0, double3 p1) +{ + return length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) float +distance(float4 p0, float4 p1) +{ + return length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) double +distance(double4 p0, double4 p1) +{ + return length(p0 - p1); +} +
diff --git a/amd-builtins/geom/dot.cl b/amd-builtins/geom/dot.cl new file mode 100644 index 0000000..3271cbb --- /dev/null +++ b/amd-builtins/geom/dot.cl
@@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +__attribute__((overloadable, weak, always_inline)) float +dot(float p0, float p1) +{ + return p0 * p1; +} + +__attribute__((overloadable, weak, always_inline)) float +dot(float2 p0, float2 p1) +{ + float2 p = p0 * p1; + return p.x + p.y; +} + +__attribute__((overloadable, weak, always_inline)) float +dot(float3 p0, float3 p1) +{ + float3 p = p0 * p1; + return p.x + p.y + p.z; +} + +__attribute__((overloadable, weak, always_inline)) float +dot(float4 p0, float4 p1) +{ + float4 p = p0 * p1; + return p.x + p.y + p.z + p.w; +} + +__attribute__((overloadable, weak, always_inline)) double +dot(double p0, double p1) +{ + return p0 * p1; +} + +__attribute__((overloadable, weak, always_inline)) double +dot(double2 p0, double2 p1) +{ + double2 p = p0 * p1; + return p.x + p.y; +} + +__attribute__((overloadable, weak, always_inline)) double +dot(double3 p0, double3 p1) +{ + double3 p = p0 * p1; + return p.x + p.y + p.z; +} + +__attribute__((overloadable, weak, always_inline)) double +dot(double4 p0, double4 p1) +{ + double4 p = p0 * p1; + return p.x + p.y + p.z + p.w; +}
diff --git a/amd-builtins/geom/fast_distance.cl b/amd-builtins/geom/fast_distance.cl new file mode 100644 index 0000000..434451b --- /dev/null +++ b/amd-builtins/geom/fast_distance.cl
@@ -0,0 +1,46 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +__attribute__((overloadable, weak,always_inline)) float +fast_distance(float p0, float p1) +{ + return fast_length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) float +fast_distance(float2 p0, float2 p1) +{ + return fast_length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) float +fast_distance(float3 p0, float3 p1) +{ + return fast_length(p0 - p1); +} + +__attribute__((overloadable, weak,always_inline)) float +fast_distance(float4 p0, float4 p1) +{ + return fast_length(p0 - p1); +} +
diff --git a/amd-builtins/geom/fast_length.cl b/amd-builtins/geom/fast_length.cl new file mode 100644 index 0000000..5d2a27e --- /dev/null +++ b/amd-builtins/geom/fast_length.cl
@@ -0,0 +1,46 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +__attribute__((overloadable, weak,always_inline)) float +fast_length(float p) +{ + return fabs(p); +} + +__attribute__((overloadable, weak,always_inline)) float +fast_length(float2 p) +{ + return half_sqrt(dot(p, p)); +} + +__attribute__((overloadable, weak,always_inline)) float +fast_length(float3 p) +{ + return half_sqrt(dot(p, p)); +} + +__attribute__((overloadable, weak,always_inline)) float +fast_length(float4 p) +{ + return half_sqrt(dot(p, p)); +} +
diff --git a/amd-builtins/geom/fast_normalize.cl b/amd-builtins/geom/fast_normalize.cl new file mode 100644 index 0000000..feb45b9 --- /dev/null +++ b/amd-builtins/geom/fast_normalize.cl
@@ -0,0 +1,49 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +__attribute__((overloadable, weak,always_inline)) float +fast_normalize(float p) +{ + return normalize(p); +} + +__attribute__((overloadable, weak,always_inline)) float2 +fast_normalize(float2 p) +{ + float l2 = dot(p, p); + return l2 == 0.0F ? p : p * half_rsqrt(l2); +} + +__attribute__((overloadable, weak,always_inline)) float3 +fast_normalize(float3 p) +{ + float l2 = dot(p, p); + return l2 == 0.0F ? p : p * half_rsqrt(l2); +} + +__attribute__((overloadable, weak,always_inline)) float4 +fast_normalize(float4 p) +{ + float l2 = dot(p, p); + return l2 == 0.0F ? p : p * half_rsqrt(l2); +} +
diff --git a/amd-builtins/geom/length.cl b/amd-builtins/geom/length.cl new file mode 100644 index 0000000..1570fc3 --- /dev/null +++ b/amd-builtins/geom/length.cl
@@ -0,0 +1,132 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */

// length: Euclidean (L2) length of p.
//
// The vector overloads protect the naive sqrt(dot(p, p)) against
// intermediate underflow and overflow: when dot(p, p) lands below
// FLT_MIN / DBL_MIN, the input is scaled UP by a power of two, the
// length is computed in a safe range, and the result is scaled back
// DOWN by the same power; when dot(p, p) overflows to infinity the
// input is scaled DOWN first and the result scaled back UP.
// Power-of-two factors are exact in binary floating point, so only
// the final sqrt rounds.  The overflow scale factor differs between
// the 2-element and the 3/4-element overloads (2^-65 vs 2^-66 for
// float, 2^-513 vs 2^-514 for double).

__attribute__((overloadable, weak,always_inline)) float
length(float p)
{
    // Scalar case: the length is just |p|.
    return fabs(p);
}

__attribute__((overloadable, weak,always_inline)) double
length(double p)
{
    // Scalar case: the length is just |p|.
    return fabs(p);
}

__attribute__((overloadable, weak,always_inline)) float
length(float2 p)
{
    float l2 = dot(p, p);

    if (l2 < FLT_MIN) {
        // dot(p, p) underflowed: rescale up, recompute, undo the scale.
        p *= 0x1.0p+86F;
        return sqrt(dot(p, p)) * 0x1.0p-86F;
    } else if (l2 == INFINITY) {
        // dot(p, p) overflowed: rescale down, recompute, undo the scale.
        p *= 0x1.0p-65F;
        return sqrt(dot(p, p)) * 0x1.0p+65F;
    }

    return sqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) double
length(double2 p)
{
    double l2 = dot(p, p);

    if (l2 < DBL_MIN) {
        // Underflow: rescale into the normal range and undo afterwards.
        p *= 0x1.0p+563;
        return sqrt(dot(p, p)) * 0x1.0p-563;
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and undo afterwards.
        p *= 0x1.0p-513;
        return sqrt(dot(p, p)) * 0x1.0p+513;
    }

    return sqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) float
length(float3 p)
{
    float l2 = dot(p, p);

    if (l2 < FLT_MIN) {
        // Underflow: rescale into the normal range and undo afterwards.
        p *= 0x1.0p+86F;
        return sqrt(dot(p, p)) * 0x1.0p-86F;
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and undo afterwards.
        p *= 0x1.0p-66F;
        return sqrt(dot(p, p)) * 0x1.0p+66F;
    }

    return sqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) double
length(double3 p)
{
    double l2 = dot(p, p);

    if (l2 < DBL_MIN) {
        // Underflow: rescale into the normal range and undo afterwards.
        p *= 0x1.0p+563;
        return sqrt(dot(p, p)) * 0x1.0p-563;
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and undo afterwards.
        p *= 0x1.0p-514;
        return sqrt(dot(p, p)) * 0x1.0p+514;
    }

    return sqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) float
length(float4 p)
{
    float l2 = dot(p, p);

    if (l2 < FLT_MIN) {
        // Underflow: rescale into the normal range and undo afterwards.
        p *= 0x1.0p+86F;
        return sqrt(dot(p, p)) * 0x1.0p-86F;
    }
    else if (l2 == INFINITY) {
        // Overflow: rescale down and undo afterwards.
        p *= 0x1.0p-66f;
        return sqrt(dot(p, p)) * 0x1.0p+66F;
    }

    return sqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) double
length(double4 p)
{
    double l2 = dot(p, p);

    if (l2 < DBL_MIN) {
        // Underflow: rescale into the normal range and undo afterwards.
        p *= 0x1.0p+563;
        return sqrt(dot(p, p)) * 0x1.0p-563;
    }
    else if (l2 == INFINITY) {
        // Overflow: rescale down and undo afterwards.
        p *= 0x1.0p-514;
        return sqrt(dot(p, p)) * 0x1.0p+514;
    }

    return sqrt(l2);
}
diff --git a/amd-builtins/geom/mix.cl b/amd-builtins/geom/mix.cl new file mode 100644 index 0000000..0ac7f9b --- /dev/null +++ b/amd-builtins/geom/mix.cl
@@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +//extern __attribute__((pure)) float __amdil_lerp_f32(float, float, float); + +// TODO_HSA: Validate that fma works for mix +__attribute__((overloadable, weak,always_inline)) float +mix(float x, float y, float a) +{ + //return __amdil_lerp_f32(a, y, x); + return fma(a, (y - x), x); +} + +__attribute__((overloadable, weak,always_inline)) double +mix(double x, double y, double a) +{ + //return x + (y - x) * a; + return fma(a, (y - x), x); +} +
diff --git a/amd-builtins/geom/normalize.cl b/amd-builtins/geom/normalize.cl new file mode 100644 index 0000000..774892b --- /dev/null +++ b/amd-builtins/geom/normalize.cl
@@ -0,0 +1,166 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */

// normalize: return the unit vector p / |p|.
//
// The vector overloads handle three hazardous ranges on top of the
// common case p * rsqrt(dot(p, p)):
//   * exact zero vector         -> returned unchanged;
//   * dot(p, p) below the normal range -> p is scaled up by an exact
//     power of two before recomputing, which leaves the direction (and
//     hence the normalized result) unchanged;
//   * dot(p, p) overflows to infinity  -> p is scaled down; if the dot
//     product is STILL infinite, p itself contains infinities, and each
//     infinite component is replaced by +/-1 (finite components by
//     +/-0) before normalizing, matching the sign of the original.

__attribute__((overloadable, weak,always_inline)) float
normalize(float p)
{
    // Scalar case: the normalized value is the sign of p.
    return sign(p);
}

__attribute__((overloadable, weak,always_inline)) double
normalize(double p)
{
    // Scalar case: the normalized value is the sign of p.
    return sign(p);
}

__attribute__((overloadable, weak,always_inline)) float2
normalize(float2 p)
{
    // Zero vector: nothing to normalize.
    if (all(p == (float2)0.0F))
        return p;

    float l2 = dot(p, p);

    if (l2 < FLT_MIN) {
        // Underflow: rescale by an exact power of two (direction unchanged).
        p *= 0x1.0p+86F;
        l2 = dot(p, p);
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and retry.
        p *= 0x1.0p-65f;
        l2 = dot(p, p);
        if (l2 == INFINITY) {
            // p contains infinities: map them to +/-1, finite lanes to +/-0.
            p = copysign(select((float2)0.0F, (float2)1.0F, isinf(p)), p);
            l2 = dot(p, p);
        }
    }
    return p * rsqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) double2
normalize(double2 p)
{
    // Zero vector: nothing to normalize.
    if (all(p == (double2)0.0))
        return p;

    double l2 = dot(p, p);

    if (l2 < DBL_MIN) {
        // Underflow: rescale by an exact power of two (direction unchanged).
        p *= 0x1.0p+563;
        l2 = dot(p, p);
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and retry.
        p *= 0x1.0p-513;
        l2 = dot(p, p);
        if (l2 == INFINITY) {
            // p contains infinities: map them to +/-1, finite lanes to +/-0.
            p = copysign(select((double2)0.0, (double2)1.0, isinf(p)), p);
            l2 = dot(p, p);
        }
    }
    return p * rsqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) float3
normalize(float3 p)
{
    // Zero vector: nothing to normalize.
    if (all(p == (float3)0.0F))
        return p;

    float l2 = dot(p, p);

    if (l2 < FLT_MIN) {
        // Underflow: rescale by an exact power of two (direction unchanged).
        p *= 0x1.0p+86F;
        l2 = dot(p, p);
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and retry.
        p *= 0x1.0p-66f;
        l2 = dot(p, p);
        if (l2 == INFINITY) {
            // p contains infinities: map them to +/-1, finite lanes to +/-0.
            p = copysign(select((float3)0.0F, (float3)1.0F, isinf(p)), p);
            l2 = dot(p, p);
        }
    }
    return p * rsqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) double3
normalize(double3 p)
{
    // Zero vector: nothing to normalize.
    if (all(p == (double3)0.0))
        return p;

    double l2 = dot(p, p);

    if (l2 < DBL_MIN) {
        // Underflow: rescale by an exact power of two (direction unchanged).
        p *= 0x1.0p+563;
        l2 = dot(p, p);
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and retry.
        p *= 0x1.0p-514;
        l2 = dot(p, p);
        if (l2 == INFINITY) {
            // p contains infinities: map them to +/-1, finite lanes to +/-0.
            p = copysign(select((double3)0.0, (double3)1.0, isinf(p)), p);
            l2 = dot(p, p);
        }
    }
    return p * rsqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) float4
normalize(float4 p)
{
    // Zero vector: nothing to normalize.
    if (all(p == (float4)0.0F))
        return p;

    float l2 = dot(p, p);

    if (l2 < FLT_MIN) {
        // Underflow: rescale by an exact power of two (direction unchanged).
        p *= 0x1.0p+86F;
        l2 = dot(p, p);
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and retry.
        p *= 0x1.0p-66f;
        l2 = dot(p, p);
        if (l2 == INFINITY) {
            // p contains infinities: map them to +/-1, finite lanes to +/-0.
            p = copysign(select((float4)0.0F, (float4)1.0F, isinf(p)), p);
            l2 = dot(p, p);
        }
    }
    return p * rsqrt(l2);
}

__attribute__((overloadable, weak,always_inline)) double4
normalize(double4 p)
{
    // Zero vector: nothing to normalize.
    if (all(p == (double4)0.0))
        return p;

    double l2 = dot(p, p);

    if (l2 < DBL_MIN) {
        // Underflow: rescale by an exact power of two (direction unchanged).
        p *= 0x1.0p+563;
        l2 = dot(p, p);
    } else if (l2 == INFINITY) {
        // Overflow: rescale down and retry.
        p *= 0x1.0p-514;
        l2 = dot(p, p);
        if (l2 == INFINITY) {
            // p contains infinities: map them to +/-1, finite lanes to +/-0.
            p = copysign(select((double4)0.0, (double4)1.0, isinf(p)), p);
            l2 = dot(p, p);
        }
    }
    return p * rsqrt(l2);
}
diff --git a/amd-builtins/geom/sign.cl b/amd-builtins/geom/sign.cl new file mode 100644 index 0000000..db5c692 --- /dev/null +++ b/amd-builtins/geom/sign.cl
@@ -0,0 +1,37 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define G(T) \ +__attribute__((overloadable, weak, always_inline)) T \ +sign(T x) \ +{ \ + return copysign(x == (T)0 | isnan(x) ? (T)0 : (T)1, x); \ +} + +G(float) +// TODO_HSA: resolve vector expansions +//G(float2) +//G(float3) +//G(float4) + +G(double) +
diff --git a/amd-builtins/geom/step.cl b/amd-builtins/geom/step.cl new file mode 100644 index 0000000..dcff4df --- /dev/null +++ b/amd-builtins/geom/step.cl
@@ -0,0 +1,48 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +__attribute__((overloadable, weak,always_inline)) float +step(float edge, float x) +{ + return x < edge ? 0.0F: 1.0F; +} + +__attribute__((overloadable, weak,always_inline)) double +step(double edge, double x) +{ + return x < edge ? 0.0: 1.0; +} + +__attribute__((overloadable, weak,always_inline)) float +smoothstep(float edge0, float edge1, float x) +{ + float t = clamp((x - edge0) / (edge1 - edge0), 0.0F, 1.0F); + return t * t * (3.0F - 2.0F * t); +} + +__attribute__((overloadable, weak,always_inline)) double +smoothstep(double edge0, double edge1, double x) +{ + double t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); + return t * t * (3.0 - 2.0 * t); +} +
diff --git a/amd-builtins/image/get.cl b/amd-builtins/image/get.cl new file mode 100644 index 0000000..ba62507 --- /dev/null +++ b/amd-builtins/image/get.cl
@@ -0,0 +1,307 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +// Image query built-ins + +#if __OPENCL_C_VERSION__ >= 200 +#define CLK_UNORM_INT24 0x10DF +// BRIG enum should match the one in \compiler\hsail-tools\libHSAIL\Brig_new.hpp +// TODO : We need to have a single file header with those enums shared across components +enum BrigImageChannelOrder { + //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc } + //.mnemo_token=EImageOrder + //.mnemo_context=EImageOrderContext + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + BRIG_CHANNEL_ORDER_RGB = 6, + BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19 +}; + +enum BrigImageChannelType { + //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc } + //.mnemo_token=EImageFormat + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_SHORT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15 +}; +#endif + +// Hsail image query intrinsics +extern __attribute__((pure)) int __hsail_query_width_1d(image1d_t); +extern __attribute__((pure)) int __hsail_query_width_1db(image1d_buffer_t); +extern __attribute__((pure)) int 
__hsail_query_width_1da(image1d_array_t); +extern __attribute__((pure)) int __hsail_query_width_2d(image2d_t); +extern __attribute__((pure)) int __hsail_query_width_2da(image2d_array_t); +extern __attribute__((pure)) int __hsail_query_width_3d(image3d_t); + +extern __attribute__((pure)) int __hsail_query_height_2d(image2d_t); +extern __attribute__((pure)) int __hsail_query_height_2da(image2d_array_t); +extern __attribute__((pure)) int __hsail_query_height_3d(image3d_t); + +extern __attribute__((pure)) int __hsail_depth_3d(image3d_t); + +extern __attribute__((pure)) int __hsail_query_format_1d(image1d_t); +extern __attribute__((pure)) int __hsail_query_format_1db(image1d_buffer_t); +extern __attribute__((pure)) int __hsail_query_format_1da(image1d_array_t); +extern __attribute__((pure)) int __hsail_query_format_2d(image2d_t); +extern __attribute__((pure)) int __hsail_query_format_2da(image2d_array_t); +extern __attribute__((pure)) int __hsail_query_format_3d(image3d_t); + +extern __attribute__((pure)) int __hsail_query_order_1d(image1d_t); +extern __attribute__((pure)) int __hsail_query_order_1db(image1d_buffer_t); +extern __attribute__((pure)) int __hsail_query_order_1da(image1d_array_t); +extern __attribute__((pure)) int __hsail_query_order_2d(image2d_t); +extern __attribute__((pure)) int __hsail_query_order_2da(image2d_array_t); +extern __attribute__((pure)) int __hsail_query_order_3d(image3d_t); + +extern __attribute__((pure)) uint __hsail_query_array_1da(image1d_array_t); +extern __attribute__((pure)) uint __hsail_query_array_2da(image2d_array_t); + + +#define DefQueryImage(Func,HsailIntrin,ImageTy,RetTy) \ +__attribute__((overloadable, always_inline)) RetTy \ +Func(ImageTy image) { \ + return (RetTy)HsailIntrin(image); \ +} + +#if __OPENCL_C_VERSION__ >= 200 +#define DefQueryImageChOrder(Func,HsailIntrin,ImageTy,RetTy) \ +__attribute__((overloadable, always_inline)) RetTy \ +Func(ImageTy image) { \ + uint Chorder = (RetTy)HsailIntrin(image); \ + return 
(mapBRIGChOrderToOCLChOrder(Chorder)); \ +} + +#define DefQueryImageChType(Func,HsailIntrin,ImageTy,RetTy) \ +__attribute__((overloadable, always_inline)) RetTy \ +Func(ImageTy image) { \ + uint Chtype = (RetTy)HsailIntrin(image); \ + return (mapBRIGChTypeToOCLChType(Chtype)); \ +} + +static inline uint mapBRIGChOrderToOCLChOrder(uint BRIGChOrder) { + uint chorder; + switch (BRIGChOrder) { + case BRIG_CHANNEL_ORDER_A: chorder = CLK_A; break; + case BRIG_CHANNEL_ORDER_R: chorder = CLK_R; break; + case BRIG_CHANNEL_ORDER_RX: chorder = CLK_Rx; break; + case BRIG_CHANNEL_ORDER_RG: chorder = CLK_RG; break; + case BRIG_CHANNEL_ORDER_RGX: chorder = CLK_RGx; break; + case BRIG_CHANNEL_ORDER_RA: chorder = CLK_RA; break; + case BRIG_CHANNEL_ORDER_RGB: chorder = CLK_RGB; break; + case BRIG_CHANNEL_ORDER_RGBX: chorder = CLK_RGBx; break; + case BRIG_CHANNEL_ORDER_RGBA: chorder = CLK_RGBA; break; + case BRIG_CHANNEL_ORDER_BGRA: chorder = CLK_BGRA; break; + case BRIG_CHANNEL_ORDER_ARGB: chorder = CLK_ARGB; break; + case BRIG_CHANNEL_ORDER_SRGB: chorder = CLK_sRGB; break; + case BRIG_CHANNEL_ORDER_SRGBX: chorder = CLK_sRGBx; break; + case BRIG_CHANNEL_ORDER_SRGBA: chorder = CLK_sRGBA; break; + case BRIG_CHANNEL_ORDER_SBGRA: chorder = CLK_sBGRA; break; + case BRIG_CHANNEL_ORDER_INTENSITY: chorder = CLK_INTENSITY; break; + case BRIG_CHANNEL_ORDER_LUMINANCE: chorder = CLK_LUMINANCE; break; + case BRIG_CHANNEL_ORDER_DEPTH: chorder = CLK_DEPTH; break; + case BRIG_CHANNEL_ORDER_DEPTH_STENCIL: chorder = CLK_DEPTH_STENCIL; break; + } + return chorder; +} + +static inline uint mapBRIGChTypeToOCLChType(uint BRIGChType) { + uint chtype; + switch (BRIGChType) { + case BRIG_CHANNEL_TYPE_SNORM_INT8: chtype = CLK_SNORM_INT8; break; + case BRIG_CHANNEL_TYPE_SNORM_INT16: chtype = CLK_SNORM_INT16; break; + case BRIG_CHANNEL_TYPE_UNORM_INT8: chtype = CLK_UNORM_INT8; break; + case BRIG_CHANNEL_TYPE_UNORM_INT16: chtype = CLK_UNORM_INT16; break; + case BRIG_CHANNEL_TYPE_UNORM_INT24: chtype = 
CLK_UNORM_INT24; break; + case BRIG_CHANNEL_TYPE_UNORM_SHORT_555: chtype = CLK_UNORM_SHORT_555; break; + case BRIG_CHANNEL_TYPE_UNORM_SHORT_565: chtype = CLK_UNORM_SHORT_565; break; + // Todo: Need to change *_UNORM_SHORT_101010 to *_UNORM_INT_101010 once BRIG enum will change + case BRIG_CHANNEL_TYPE_UNORM_SHORT_101010: chtype = CLK_UNORM_INT_101010; break; + case BRIG_CHANNEL_TYPE_SIGNED_INT8: chtype = CLK_SIGNED_INT8; break; + case BRIG_CHANNEL_TYPE_SIGNED_INT16: chtype = CLK_SIGNED_INT16; break; + case BRIG_CHANNEL_TYPE_SIGNED_INT32: chtype = CLK_SIGNED_INT32; break; + case BRIG_CHANNEL_TYPE_UNSIGNED_INT8: chtype = CLK_UNSIGNED_INT8; break; + case BRIG_CHANNEL_TYPE_UNSIGNED_INT16: chtype = CLK_UNSIGNED_INT16; break; + case BRIG_CHANNEL_TYPE_UNSIGNED_INT32: chtype = CLK_UNSIGNED_INT32; break; + case BRIG_CHANNEL_TYPE_HALF_FLOAT: chtype = CLK_HALF_FLOAT; break; + case BRIG_CHANNEL_TYPE_FLOAT: chtype = CLK_FLOAT; break; + } + return chtype; +} +#endif + +DefQueryImage(get_image_width, __hsail_query_width_1d, image1d_t, int) +DefQueryImage(get_image_width, __hsail_query_width_1db, image1d_buffer_t, int) +DefQueryImage(get_image_width, __hsail_query_width_1da, image1d_array_t, int) +DefQueryImage(get_image_width, __hsail_query_width_2d, image2d_t, int) +DefQueryImage(get_image_width, __hsail_query_width_2da, image2d_array_t, int) +DefQueryImage(get_image_width, __hsail_query_width_3d, image3d_t, int) + +DefQueryImage(get_image_height, __hsail_query_height_2d, image2d_t, int) +DefQueryImage(get_image_height, __hsail_query_height_2da, image2d_array_t, int) +DefQueryImage(get_image_height, __hsail_query_height_3d, image3d_t, int) + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) int +get_image_depth(image3d_t image) { + return __hsail_depth_3d(image); +} + +#if __OPENCL_C_VERSION__ >= 200 +DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_1d, image1d_t, int) +DefQueryImageChType(get_image_channel_data_type, 
__hsail_query_format_1db, image1d_buffer_t, int) +DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_1da, image1d_array_t, int) +DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_2d, image2d_t, int) +DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_2da, image2d_array_t, int) +DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_3d, image3d_t, int) + +DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_1d, image1d_t, int) +DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_1db, image1d_buffer_t, int) +DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_1da, image1d_array_t, int) +DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_2d, image2d_t, int) +DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_2da, image2d_array_t, int) +DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_3d, image3d_t, int) + +#else +DefQueryImage(get_image_channel_data_type, __hsail_query_format_1d, image1d_t, int) +DefQueryImage(get_image_channel_data_type, __hsail_query_format_1db, image1d_buffer_t, int) +DefQueryImage(get_image_channel_data_type, __hsail_query_format_1da, image1d_array_t, int) +DefQueryImage(get_image_channel_data_type, __hsail_query_format_2d, image2d_t, int) +DefQueryImage(get_image_channel_data_type, __hsail_query_format_2da, image2d_array_t, int) +DefQueryImage(get_image_channel_data_type, __hsail_query_format_3d, image3d_t, int) + +DefQueryImage(get_image_channel_order, __hsail_query_order_1d, image1d_t, int) +DefQueryImage(get_image_channel_order, __hsail_query_order_1db, image1d_buffer_t, int) +DefQueryImage(get_image_channel_order, __hsail_query_order_1da, image1d_array_t, int) +DefQueryImage(get_image_channel_order, __hsail_query_order_2d, image2d_t, int) +DefQueryImage(get_image_channel_order, __hsail_query_order_2da, image2d_array_t, int) +DefQueryImage(get_image_channel_order, 
__hsail_query_order_3d, image3d_t, int) +#endif + +__attribute__((overloadable, always_inline)) int2 +get_image_dim(image2d_t image) { + int2 dim; + dim.x = get_image_width(image); + dim.y = get_image_height(image); + return dim; +} + +__attribute__((overloadable, always_inline)) int2 +get_image_dim(image2d_array_t image) { + int2 dim; + dim.x = get_image_width(image); + dim.y = get_image_height(image); + return dim; +} + +__attribute__((overloadable, always_inline)) int4 +get_image_dim(image3d_t image) { + int4 dim; + dim.x = get_image_width(image); + dim.y = get_image_height(image); + dim.z = get_image_depth(image); + dim.w = 0; + return dim; +} + +DefQueryImage(get_image_array_size, __hsail_query_array_1da, image1d_array_t, size_t); +DefQueryImage(get_image_array_size, __hsail_query_array_2da, image2d_array_t, size_t); + +#if __OPENCL_C_VERSION__ >= 200 +// Image-2.0 query built-ins + +// Hsail image query intrinsics +extern __attribute__((pure)) int __hsail_query_width_2ddepth(image2d_depth_t); +extern __attribute__((pure)) int __hsail_query_width_2dadepth(image2d_array_depth_t); + +extern __attribute__((pure)) int __hsail_query_height_2ddepth(image2d_depth_t); +extern __attribute__((pure)) int __hsail_query_height_2dadepth(image2d_array_depth_t); + +extern __attribute__((pure)) int __hsail_query_array_2dadepth(image2d_array_depth_t); + +extern __attribute__((pure)) int __hsail_query_channelorder_2ddepth(image2d_depth_t); +extern __attribute__((pure)) int __hsail_query_channelorder_2dadepth(image2d_array_depth_t); + +extern __attribute__((pure)) int __hsail_query_channeltype_2ddepth(image2d_depth_t); +extern __attribute__((pure)) int __hsail_query_channeltype_2dadepth(image2d_array_depth_t); + +DefQueryImage(get_image_width, __hsail_query_width_2ddepth, image2d_depth_t, int) +DefQueryImage(get_image_width, __hsail_query_width_2dadepth, image2d_array_depth_t, int) + +DefQueryImage(get_image_height, __hsail_query_height_2ddepth, image2d_depth_t, int) 
+DefQueryImage(get_image_height, __hsail_query_height_2dadepth, image2d_array_depth_t, int) + +DefQueryImageChType(get_image_channel_data_type, __hsail_query_channeltype_2ddepth, image2d_depth_t, int) +DefQueryImageChType(get_image_channel_data_type, __hsail_query_channeltype_2dadepth, image2d_array_depth_t, int) + +DefQueryImageChOrder(get_image_channel_order, __hsail_query_channelorder_2ddepth, image2d_depth_t, int) +DefQueryImageChOrder(get_image_channel_order, __hsail_query_channelorder_2dadepth, image2d_array_depth_t, int) + + +__attribute__((overloadable, always_inline)) int2 +get_image_dim(image2d_depth_t image) { + int2 dim; + dim.x = get_image_width(image); + dim.y = get_image_height(image); + return dim; +} +__attribute__((overloadable, always_inline)) int2 +get_image_dim(image2d_array_depth_t image) { + int2 dim; + dim.x = get_image_width(image); + dim.y = get_image_height(image); + return dim; +} + +DefQueryImage(get_image_array_size, __hsail_query_array_2dadepth, image2d_array_depth_t, size_t) + +#endif // __OPENCL_C_VERSION__ >= 200
diff --git a/amd-builtins/image/read.cl b/amd-builtins/image/read.cl new file mode 100644 index 0000000..0f7ae0b --- /dev/null +++ b/amd-builtins/image/read.cl
@@ -0,0 +1,312 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */

// Image read built-ins.
//
// Each read_image{f,i,ui} overload is generated by one of the Def*
// macros below. A macro declares the matching HSAIL read intrinsic
// (which returns the four channels in a pixel_data struct) and defines
// an overloadable, always-inline wrapper that repacks the channels
// into a 4-element vector. "Def(Read)Image*" variants take a sampler;
// "DefLoadImage*" variants are the sampler-less forms.

// Per-channel results returned by the HSAIL read intrinsics; the
// suffix (_f32/_s32/_u32) is selected by the DstSuf macro argument.
struct pixel_data_f32 {
    float x;
    float y;
    float z;
    float w;
};

struct pixel_data_s32 {
    int x;
    int y;
    int z;
    int w;
};

struct pixel_data_u32 {
    uint x;
    uint y;
    uint z;
    uint w;
};

// Read Image 1d

// Func: built-in name; HsailIntrin: intrinsic to call; DstTy: element
// type of the returned vector; CoordTy: coordinate type (int or
// float); DstSuf: pixel_data struct suffix matching DstTy.
#define DefReadImage1D(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image1d_t, sampler_t, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image1d_t image, sampler_t sampler, CoordTy coord) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, sampler, coord); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefReadImage1D(read_imagef, __hsail_rdimagef_1d_s32, float, int, _f32)
DefReadImage1D(read_imagef, __hsail_rdimagef_1d_f32, float, float, _f32)
DefReadImage1D(read_imagei, __hsail_rdimagei_1d_s32, int, int, _s32)
DefReadImage1D(read_imagei, __hsail_rdimagei_1d_f32, int, float, _s32)
DefReadImage1D(read_imageui, __hsail_rdimageui_1d_s32, uint, int, _u32)
DefReadImage1D(read_imageui, __hsail_rdimageui_1d_f32, uint, float, _u32)

// Read Image 1d Array
// coords.x is the in-slice coordinate, coords.y the array index.

#define DefReadImage1DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image1d_array_t, sampler_t, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image1d_array_t image, sampler_t sampler, CoordTy##2 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, sampler, coords.x, coords.y); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefReadImage1DArray(read_imagef, __hsail_rdimagef_1da_s32, float, int, _f32)
DefReadImage1DArray(read_imagef, __hsail_rdimagef_1da_f32, float, float, _f32)
DefReadImage1DArray(read_imagei, __hsail_rdimagei_1da_s32, int, int, _s32)
DefReadImage1DArray(read_imagei, __hsail_rdimagei_1da_f32, int, float, _s32)
DefReadImage1DArray(read_imageui, __hsail_rdimageui_1da_s32, uint, int, _u32)
DefReadImage1DArray(read_imageui, __hsail_rdimageui_1da_f32, uint, float, _u32)

// Read Image 2d

#define DefReadImage2D(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image2d_t, sampler_t, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image2d_t image, sampler_t sampler, CoordTy##2 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, sampler, coords.x, coords.y); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefReadImage2D(read_imagef, __hsail_rdimagef_2d_s32, float, int, _f32)
DefReadImage2D(read_imagef, __hsail_rdimagef_2d_f32, float, float, _f32)
DefReadImage2D(read_imagei, __hsail_rdimagei_2d_s32, int, int, _s32)
DefReadImage2D(read_imagei, __hsail_rdimagei_2d_f32, int, float, _s32)
DefReadImage2D(read_imageui, __hsail_rdimageui_2d_s32, uint, int, _u32)
DefReadImage2D(read_imageui, __hsail_rdimageui_2d_f32, uint, float, _u32)

// Read Image 2d Array
// coords.xy are the in-slice coordinates, coords.z the array index;
// coords.w is ignored.

#define DefReadImage2DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image2d_array_t, sampler_t, CoordTy, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image2d_array_t image, sampler_t sampler, CoordTy##4 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, sampler, coords.x, coords.y, coords.z); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefReadImage2DArray(read_imagef, __hsail_rdimagef_2da_s32, float, int, _f32)
DefReadImage2DArray(read_imagef, __hsail_rdimagef_2da_f32, float, float, _f32)
DefReadImage2DArray(read_imagei, __hsail_rdimagei_2da_s32, int, int, _s32)
DefReadImage2DArray(read_imagei, __hsail_rdimagei_2da_f32, int, float, _s32)
DefReadImage2DArray(read_imageui, __hsail_rdimageui_2da_s32, uint, int, _u32)
DefReadImage2DArray(read_imageui, __hsail_rdimageui_2da_f32, uint, float, _u32)

// Read Image 3d
// coords.xyz are used; coords.w is ignored.

#define DefReadImage3D(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image3d_t, sampler_t, CoordTy, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image3d_t image, sampler_t sampler, CoordTy##4 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, sampler, coords.x, coords.y, coords.z); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefReadImage3D(read_imagef, __hsail_rdimagef_3d_s32, float, int, _f32)
DefReadImage3D(read_imagef, __hsail_rdimagef_3d_f32, float, float, _f32)
DefReadImage3D(read_imagei, __hsail_rdimagei_3d_s32, int, int, _s32)
DefReadImage3D(read_imagei, __hsail_rdimagei_3d_f32, int, float, _s32)
DefReadImage3D(read_imageui, __hsail_rdimageui_3d_s32, uint, int, _u32)
DefReadImage3D(read_imageui, __hsail_rdimageui_3d_f32, uint, float, _u32)

// Sampler-less Read Image 1d
// Sampler-less reads take integer coordinates only.

#define DefLoadImage1D(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image1d_t, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image1d_t image, CoordTy coord) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, coord); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefLoadImage1D(read_imagef, __hsail_ldimagef_1d_u32, float, int, _f32)
DefLoadImage1D(read_imagei, __hsail_ldimagei_1d_u32, int, int, _s32)
DefLoadImage1D(read_imageui, __hsail_ldimageui_1d_u32, uint, int, _u32)

// Sampler-less Read Image 1d buffer

#define DefLoadImage1DBuffer(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image1d_buffer_t, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image1d_buffer_t image, CoordTy coord) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, coord); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefLoadImage1DBuffer(read_imagef, __hsail_ldimagef_1db_u32, float, int, _f32)
DefLoadImage1DBuffer(read_imagei, __hsail_ldimagei_1db_u32, int, int, _s32)
DefLoadImage1DBuffer(read_imageui, __hsail_ldimageui_1db_u32, uint, int, _u32)

// Sampler-less Read Image 1d Array

#define DefLoadImage1DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image1d_array_t, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image1d_array_t image, CoordTy##2 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, coords.x, coords.y); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefLoadImage1DArray(read_imagef, __hsail_ldimagef_1da_u32, float, int, _f32)
DefLoadImage1DArray(read_imagei, __hsail_ldimagei_1da_u32, int, int, _s32)
DefLoadImage1DArray(read_imageui, __hsail_ldimageui_1da_u32, uint, int, _u32)

// Sampler-less Read Image 2d

#define DefLoadImage2D(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image2d_t, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image2d_t image, CoordTy##2 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, coords.x, coords.y); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefLoadImage2D(read_imagef, __hsail_ldimagef_2d_u32, float, int, _f32)
DefLoadImage2D(read_imagei, __hsail_ldimagei_2d_u32, int, int, _s32)
DefLoadImage2D(read_imageui, __hsail_ldimageui_2d_u32, uint, int, _u32)

// Sampler-less Read Image 2d array

#define DefLoadImage2DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image2d_array_t, CoordTy, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image2d_array_t image, CoordTy##4 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, coords.x, coords.y, coords.z); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefLoadImage2DArray(read_imagef, __hsail_ldimagef_2da_u32, float, int, _f32)
DefLoadImage2DArray(read_imagei, __hsail_ldimagei_2da_u32, int, int, _s32)
DefLoadImage2DArray(read_imageui, __hsail_ldimageui_2da_u32, uint, int, _u32)

// Sampler-less Read Image 3d

#define DefLoadImage3D(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
  extern struct pixel_data##DstSuf \
  HsailIntrin(image3d_t, CoordTy, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy##4 \
  Func(image3d_t image, CoordTy##4 coords) { \
    struct pixel_data##DstSuf \
    data = HsailIntrin(image, coords.x, coords.y, coords.z); \
    return (DstTy##4)(data.x, data.y, data.z, data.w); \
  }

DefLoadImage3D(read_imagef, __hsail_ldimagef_3d_u32, float, int, _f32)
DefLoadImage3D(read_imagei, __hsail_ldimagei_3d_u32, int, int, _s32)
DefLoadImage3D(read_imageui, __hsail_ldimageui_3d_u32, uint, int, _u32)

#if __OPENCL_C_VERSION__ >= 200
// Image-2.0 read built-ins
// Depth images carry a single channel, so these return scalar float.

// Read Image 2d depth
#define DefReadImage2DDepth(Func, HsailIntrin, DstTy, CoordTy) \
  float \
  HsailIntrin(image2d_depth_t, sampler_t, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy \
  Func(image2d_depth_t image, sampler_t sampler, CoordTy##2 coords) { \
    float \
    data = HsailIntrin(image, sampler, coords.x, coords.y); \
    return data; \
}

DefReadImage2DDepth(read_imagef, __hsail_rdimagef_2ddepth_s32, float, int)
DefReadImage2DDepth(read_imagef, __hsail_rdimagef_2ddepth_f32, float, float)

// Read Image 2d array depth

#define DefReadImage2DArrayDepth(Func, HsailIntrin, DstTy, CoordTy) \
  float \
  HsailIntrin(image2d_array_depth_t, sampler_t, CoordTy, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy \
  Func(image2d_array_depth_t image, sampler_t sampler, CoordTy##4 coords) { \
    float \
    data = HsailIntrin(image, sampler, coords.x, coords.y, coords.z); \
    return data; \
}

DefReadImage2DArrayDepth(read_imagef, __hsail_rdimagef_2dadepth_s32, float, int)
DefReadImage2DArrayDepth(read_imagef, __hsail_rdimagef_2dadepth_f32, float, float)


// Sampler-less Read Image 2d depth
#define DefLoadImage2DDepth(Func, HsailIntrin, DstTy, CoordTy) \
  float \
  HsailIntrin(image2d_depth_t, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy \
  Func(image2d_depth_t image, CoordTy##2 coords) { \
    float \
    data = HsailIntrin(image, coords.x, coords.y); \
    return data; \
}

DefLoadImage2DDepth(read_imagef, __hsail_ldimagef_2ddepth_u32, float, int)

// Sampler-less Read Image 2d array depth

#define DefLoadImage2DArrayDepth(Func, HsailIntrin, DstTy, CoordTy) \
  float \
  HsailIntrin(image2d_array_depth_t, CoordTy, CoordTy, CoordTy); \
 \
  __attribute__((overloadable, always_inline)) DstTy \
  Func(image2d_array_depth_t image, CoordTy##4 coords) { \
    float \
    data = HsailIntrin(image, coords.x, coords.y, coords.z); \
    return data; \
}

DefLoadImage2DArrayDepth(read_imagef, __hsail_ldimagef_2dadepth_u32, float, int)

#endif // __OPENCL_C_VERSION__ >= 200
diff --git a/amd-builtins/image/write.cl b/amd-builtins/image/write.cl new file mode 100644 index 0000000..0bf5abe --- /dev/null +++ b/amd-builtins/image/write.cl
@@ -0,0 +1,141 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +// Image write built-ins + +// Hsail store image intrinsics +extern void __hsail_stimagef_1d_i32(float, float, float, float, image1d_t, int); +extern void __hsail_stimagei_1d_i32(int, int, int, int, image1d_t, int); +extern void __hsail_stimageui_1d_i32(uint, uint, uint, uint, image1d_t, int); + +extern void __hsail_stimagef_1db_i32(float, float, float, float, image1d_buffer_t, int); +extern void __hsail_stimagei_1db_i32(int, int, int, int, image1d_buffer_t, int); +extern void __hsail_stimageui_1db_i32(uint, uint, uint, uint, image1d_buffer_t, int); + +extern void __hsail_stimagef_1da_i32(float, float, float, float, image1d_array_t, int, int); +extern void __hsail_stimagei_1da_i32(int, int, int, int, image1d_array_t, int, int); +extern void __hsail_stimageui_1da_i32(uint, uint, uint, uint, image1d_array_t, int, int); + +extern void __hsail_stimagef_2d_i32(float, float, float, float, image2d_t, int, int); +extern void __hsail_stimagei_2d_i32(int, int, int, int, image2d_t, int, int); +extern void __hsail_stimageui_2d_i32(uint, uint, uint, uint, image2d_t, int, int); + +extern void __hsail_stimagef_2da_i32(float, float, float, float, image2d_array_t, int, int, int, int); +extern void __hsail_stimagei_2da_i32(int, int, int, int, image2d_array_t, int, int, int, int); +extern void __hsail_stimageui_2da_i32(uint, uint, uint, uint, image2d_array_t, int, int, int, int); + +extern void __hsail_stimagef_3d_i32(float, float, float, float, image3d_t, int, int, int, int); +extern void __hsail_stimagei_3d_i32(int, int, int, int, image3d_t, int, int, int, int); +extern void __hsail_stimageui_3d_i32(uint, uint, uint, uint, image3d_t, int, int, int, int); + + +#define DefWriteImage1d(Func,HsailIntrin,CoordTy,ValTy) \ +__attribute__((overloadable, always_inline)) void \ +Func(image1d_t image, CoordTy coords, ValTy##4 val) { \ + HsailIntrin(val.x, val.y, val.z, val.w, image, coords); \ +} + +#define DefWriteImage1dBuffer(Func,HsailIntrin,CoordTy,ValTy) \ 
+__attribute__((overloadable, always_inline)) void \ +Func(image1d_buffer_t image, CoordTy coords, ValTy##4 val) { \ + HsailIntrin(val.x, val.y, val.z, val.w, image, coords); \ +} + +#define DefWriteImage1dArray(Func,HsailIntrin,CoordTy,ValTy) \ +__attribute__((overloadable, always_inline)) void \ + Func(image1d_array_t image, CoordTy##2 coords, ValTy##4 val) { \ + HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y); \ +} + +#define DefWriteImage2d(Func,HsailIntrin,CoordTy,ValTy) \ +__attribute__((overloadable, always_inline)) void \ +Func(image2d_t image, CoordTy##2 coords, ValTy##4 val) { \ + HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y); \ +} + +#define DefWriteImage2dArray(Func,HsailIntrin,CoordTy,ValTy) \ +__attribute__((overloadable, always_inline)) void \ +Func(image2d_array_t image, CoordTy##4 coords, ValTy##4 val) { \ + HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y, coords.z, coords.w); \ +} + +#define DefWriteImage3d(Func,HsailIntrin,CoordTy,ValTy) \ +__attribute__((overloadable, always_inline)) void \ +Func(image3d_t image, CoordTy##4 coords, ValTy##4 val) { \ + HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y, coords.z, coords.w); \ +} + +// Write Image 1d +DefWriteImage1d(write_imagef, __hsail_stimagef_1d_i32, int, float) +DefWriteImage1d(write_imagei, __hsail_stimagei_1d_i32, int, int) +DefWriteImage1d(write_imageui, __hsail_stimageui_1d_i32, int, uint) + +// Write Image 1d Array +DefWriteImage1dArray(write_imagef, __hsail_stimagef_1da_i32, int, float) +DefWriteImage1dArray(write_imagei, __hsail_stimagei_1da_i32, int, int) +DefWriteImage1dArray(write_imageui, __hsail_stimageui_1da_i32, int, uint) + +// Write Image 1d Buffer +DefWriteImage1dBuffer(write_imagef, __hsail_stimagef_1db_i32, int, float) +DefWriteImage1dBuffer(write_imagei, __hsail_stimagei_1db_i32, int, int) +DefWriteImage1dBuffer(write_imageui, __hsail_stimageui_1db_i32, int, uint) + +// Write Image 2d 
+DefWriteImage2d(write_imagef, __hsail_stimagef_2d_i32, int, float) +DefWriteImage2d(write_imagei, __hsail_stimagei_2d_i32, int, int) +DefWriteImage2d(write_imageui, __hsail_stimageui_2d_i32, int, uint) + +// Write Image 2d Array +DefWriteImage2dArray(write_imagef, __hsail_stimagef_2da_i32, int, float) +DefWriteImage2dArray(write_imagei, __hsail_stimagei_2da_i32, int, int) +DefWriteImage2dArray(write_imageui, __hsail_stimageui_2da_i32, int, uint) + +// Write Image 3d +DefWriteImage3d(write_imagef, __hsail_stimagef_3d_i32, int, float) +DefWriteImage3d(write_imagei, __hsail_stimagei_3d_i32, int, int) +DefWriteImage3d(write_imageui, __hsail_stimageui_3d_i32, int, uint) + +#ifdef __clang__ +// Image-2.0 write built-ins + +// Hsail store image intrinsics +extern void __hsail_stimagef_2ddepth_i32(float, image2d_depth_t, int, int); +extern void __hsail_stimagef_2dadepth_i32(float, image2d_array_depth_t, int, int, int, int); + +#define DefWriteImage2dDepth(Func, HsailIntrin, CoordTy, ValTy) \ +__attribute__((overloadable, always_inline)) void \ +Func(image2d_depth_t image, CoordTy##2 coords, ValTy val) { \ + HsailIntrin(val, image, coords.x, coords.y); \ +} + +#define DefWriteImage2dArrayDepth(Func, HsailIntrin, CoordTy, ValTy) \ +__attribute__((overloadable, always_inline)) void \ +Func(image2d_array_depth_t image, CoordTy##4 coords, ValTy val) { \ + HsailIntrin(val, image, coords.x, coords.y, coords.z, coords.w); \ +} + +// Write Image 2d Depth +DefWriteImage2dDepth(write_imagef, __hsail_stimagef_2ddepth_i32, int, float) + +// Write Image 2d Array Depth +DefWriteImage2dArrayDepth(write_imagef, __hsail_stimagef_2dadepth_i32, int, float) + +#endif \ No newline at end of file
diff --git a/amd-builtins/int/abs_base.cl b/amd-builtins/int/abs_base.cl new file mode 100644 index 0000000..d0c2317 --- /dev/null +++ b/amd-builtins/int/abs_base.cl
@@ -0,0 +1,93 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +// ----- char ----- + +__attribute__((overloadable, always_inline)) uchar +abs(char x) +{ + char s = x >> 7; + return (uchar)((x + s) ^ s); +} + + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +abs(uchar x) +{ + return x; +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) ushort +abs(short x) +{ + short s = x >> 15; + return (ushort)((x + s) ^ s); +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +abs(ushort x) +{ + return x; +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) uint +abs(int x) +{ + int s = x >> 31; + return (uint)((x + s) ^ s); +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +abs(uint x) +{ + return x; +} + +// ----- long ----- + +__attribute__((overloadable, always_inline)) ulong +abs(long x) +{ + long s = x >> 63; + return (ulong)((x + s) ^ s); +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +abs(ulong x) +{ + return x; +} +
diff --git a/amd-builtins/int/abs_diff_base.cl b/amd-builtins/int/abs_diff_base.cl new file mode 100644 index 0000000..0cf2007 --- /dev/null +++ b/amd-builtins/int/abs_diff_base.cl
@@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// ----- char ----- + +__attribute__((overloadable, always_inline)) uchar +abs_diff(char x, char y) +{ + int ix = x; + int iy = y; + int d = max(ix,iy) - min(ix,iy); + return (uchar)d; +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +abs_diff(uchar x, uchar y) +{ + uint ux = x; + uint uy = y; + uint d = max(ux,uy) - min(ux,uy); + return (uchar)d; +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) ushort +abs_diff(short x, short y) +{ + int ix = x; + int iy = y; + int d = max(ix,iy) - min(ix,iy); + return (ushort)d; +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +abs_diff(ushort x, ushort y) +{ + uint ux = x; + uint uy = y; + uint d = max(ux,uy) - min(ux,uy); + return (ushort)d; +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) uint +abs_diff(int x, int y) +{ + return (uint)(max(x,y) - min(x,y)); +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +abs_diff(uint x, uint y) +{ + return max(x,y) - min(x,y); +} + +// ----- long ----- + +__attribute__((overloadable, always_inline)) ulong +abs_diff(long x, long y) +{ + long xmy = x - y; + long ymx = y - x; + return (ulong)(x > y ? xmy : ymx); +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +abs_diff(ulong x, ulong y) +{ + ulong xmy = x - y; + ulong ymx = y - x; + return x > y ? xmy : ymx; +} +
diff --git a/amd-builtins/int/add_sat_base.cl b/amd-builtins/int/add_sat_base.cl new file mode 100644 index 0000000..7adc9f8 --- /dev/null +++ b/amd-builtins/int/add_sat_base.cl
@@ -0,0 +1,101 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +add_sat(char x, char y) +{ + int s = (int)x + (int) y; + return max(-128, min(127, s)); +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +add_sat(uchar x, uchar y) +{ + uint s = (uint)x + (uint)y; + return min(255U, s); +} + + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +add_sat(short x, short y) +{ + int s = (int)x + (int) y; + return max(-32768, min(32767, s)); +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +add_sat(ushort x, ushort y) +{ + uint s = (uint)x + (uint)y; + return min(65535U, s); +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +add_sat(int x, int y) +{ + int s = x + y; + s = y < 1 & (int)0x80000000 - y > x ? (int)0x80000000 : s; + s = y > 0 & 0x7fffffff - y < x ? 0x7fffffff : s; + return s; +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +add_sat(uint x, uint y) +{ + uint s = x + y; + s = 0xffffffffU - y < x ? 0xffffffffU : s; + return s; +} + +// ----- long ----- + +__attribute__((overloadable, always_inline)) long +add_sat(long x, long y) +{ + long s = x + y; + s = y < 1 & (long)0x8000000000000000 - y > x ? (long)0x8000000000000000 : s; + s = y > 0 & 0x7fffffffffffffffL - y < x ? 0x7fffffffffffffffL : s; + return s; +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +add_sat(ulong x, ulong y) +{ + ulong s = x + y; + s = 0xffffffffffffffffUL - y < x ? 0xffffffffffffffffUL : s; + return s; +} +
diff --git a/amd-builtins/int/clz_base.cl b/amd-builtins/int/clz_base.cl new file mode 100644 index 0000000..21f2c5f --- /dev/null +++ b/amd-builtins/int/clz_base.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "ibuiltins.h"

// clz: count leading zero bits. Built on the HSAIL firstbit operation
// (__hsail_firstbit_u32); x == 0 is special-cased so clz(0) returns
// the bit width of the operand type.

__attribute__((always_inline)) static uint
myclz4(uint x)
{
    uint z = __hsail_firstbit_u32(x);
    return x == 0U ? 32U : z;
}

// ----- [u]char -----

// Narrow types are widened to 32 bits; subtract the leading zeros
// contributed by the widening (24 for 8-bit, 16 for 16-bit types).
// The signed variants mask off sign-extension bits first.

__attribute__((overloadable, always_inline)) char
clz(char x)
{
    return myclz4((uint)x & 0xffU) - 24U;
}

__attribute__((overloadable, always_inline)) uchar
clz(uchar x)
{
    return myclz4((uint)x) - 24U;
}

// ----- [u]short -----

__attribute__((overloadable, always_inline)) short
clz(short x)
{
    return myclz4((uint)x & 0xffffU) - 16U;
}

__attribute__((overloadable, always_inline)) ushort
clz(ushort x)
{
    return myclz4((uint)x) - 16U;
}

// ----- [u]int -----

// The 32-bit overloads are direct aliases of myclz4.
extern __attribute__((overloadable, alias("myclz4"))) uint clz(uint);
extern __attribute__((overloadable, alias("myclz4"))) int clz(int);

// ----- [u]long -----

// 64-bit clz from two 32-bit firstbit operations: if the high word is
// nonzero the answer is its leading-zero count; otherwise it is
// 32 + clz(low word) (or 64 when both words are zero).
__attribute__((always_inline)) static ulong
myclz8(ulong x)
{
    uint xlo = (uint)x;
    uint xhi = (uint)(x >> 32);
    uint zlo = __hsail_firstbit_u32(xlo);
    uint zhi = __hsail_firstbit_u32(xhi);
    uint clo = (xlo == 0 ? 32 : zlo) + 32;
    return xhi == 0 ? clo : zhi;
}

extern __attribute__((overloadable, alias("myclz8"))) ulong clz(ulong);
extern __attribute__((overloadable, alias("myclz8"))) long clz(long);
diff --git a/amd-builtins/int/ctz_base.cl b/amd-builtins/int/ctz_base.cl new file mode 100644 index 0000000..545ac03 --- /dev/null +++ b/amd-builtins/int/ctz_base.cl
@@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "ibuiltins.h" + +#if __OPENCL_C_VERSION__ >= 200 + +// ----- [u]char ----- + +__attribute__((overloadable, always_inline)) char +ctz(char x) +{ + uint z = __hsail_lastbit_u32((uint)x & 0xffU); + return x == 0 ? 8U : z; +} + +__attribute__((overloadable, always_inline)) uchar +ctz(uchar x) +{ + uint z = __hsail_lastbit_u32((uint)x); + return x == 0 ? 8U : z; +} + +// ----- [u]short ----- + +__attribute__((overloadable, always_inline)) short +ctz(short x) +{ + uint z = __hsail_lastbit_u32((uint)x & 0xffffU); + return x == 0 ? 16U : z; +} + +__attribute__((overloadable, always_inline)) ushort +ctz(ushort x) +{ + uint z = __hsail_lastbit_u32((uint)x); + return x == 0 ? 
16U : z; +} + +// ----- [u]int ----- + +__attribute__((always_inline)) static uint +myctz4(uint x) +{ + uint z = __hsail_lastbit_u32(x); + return x == 0U ? 32 : z; +} + +extern __attribute__((overloadable, alias("myctz4"))) uint ctz(uint); +extern __attribute__((overloadable, alias("myctz4"))) int ctz(int); + +// ----- [u]long ----- + +__attribute__((always_inline)) static ulong +myctz8(ulong x) +{ + uint xhi = x >> 32; + uint xlo = (uint)x; + uint zhi = __hsail_lastbit_u32(xhi); + uint zlo = __hsail_lastbit_u32(xlo); + uint chi = (xhi == 0 ? 32 : zhi) + 32; + return xlo == 0 ? chi : zlo; +} + +extern __attribute__((overloadable, alias("myctz8"))) ulong ctz(ulong); +extern __attribute__((overloadable, alias("myctz8"))) long ctz(long); + +#endif
diff --git a/amd-builtins/int/hadd_base.cl b/amd-builtins/int/hadd_base.cl new file mode 100644 index 0000000..b19ec2c --- /dev/null +++ b/amd-builtins/int/hadd_base.cl
@@ -0,0 +1,94 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +hadd(char x, char y) +{ + // compiler automatically casts larger + return (x + y) >> 1; +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +hadd(uchar x, uchar y) +{ + // compiler automatically casts larger + return (x + y) >> 1; +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +hadd(short x, short y) +{ + // compiler automatically casts larger + return (x + y) >> 1; +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +hadd(ushort x, ushort y) +{ + // compiler automatically casts larger + return (x + y) >> 1; +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +hadd(int x, int y) +{ + int cin = (x & 1) & y; + return (x >> 1) + (y >> 1) + cin; +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +hadd(uint x, uint y) +{ + uint cin = (x & 1U) & y; + return (x >> 1) + (y >> 1) + cin; +} + +// ----- long ----- + +__attribute__((overloadable, always_inline)) long +hadd(long x, long y) +{ + long cin = (x & 1) & y; + return (x >> 1) + (y >> 1) + cin; +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +hadd(ulong x, ulong y) +{ + ulong cin = (x & 1) & y; + return (x >> 1) + (y >> 1) + cin; +} +
diff --git a/amd-builtins/int/ibuiltins.h b/amd-builtins/int/ibuiltins.h new file mode 100644 index 0000000..a46ae6f --- /dev/null +++ b/amd-builtins/int/ibuiltins.h
@@ -0,0 +1,74 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +extern __attribute__((pure)) int __amdil_count_bits_i32(int); +extern __attribute__((pure)) int __hsail_firstbit_u32(uint); +extern __attribute__((pure)) int __hsail_lastbit_u32(uint); + +extern __attribute__((pure)) int __amdil_imad24_i32(int, int, int); +extern __attribute__((pure)) uint __amdil_umad24_u32(uint, uint, uint); +extern __attribute__((pure)) int __amdil_imul24_i32(int, int); +extern __attribute__((pure)) uint __amdil_umul24_u32(uint, uint); + +extern __attribute__((pure)) int __amdil_imin_i32(int, int); +extern __attribute__((pure)) int __amdil_imax_i32(int, int); +extern __attribute__((pure)) uint __amdil_umin_u32(uint, uint); +extern __attribute__((pure)) uint __amdil_umax_u32(uint, uint); + +extern __attribute__((pure)) long __amdil_imin_i64(long, long); +extern __attribute__((pure)) long __amdil_imax_i64(long, long); +extern __attribute__((pure)) ulong __amdil_umin_u64(ulong, ulong); +extern __attribute__((pure)) ulong __amdil_umax_u64(ulong, ulong); + +extern __attribute__((pure)) int __amdil_imul_high_i32(int, int); +extern __attribute__((pure)) uint __amdil_umul_high_u32(uint, uint); + +static inline long +_gpu_mul_hi_i64(long x, long y) +{ + ulong x0 = (ulong)x & 0xffffffffUL; + long x1 = x >> 32; + ulong y0 = (ulong)y & 0xffffffffUL; + long y1 = y >> 32; + ulong z0 = x0*y0; + long t = x1*y0 + (z0 >> 32); + long z1 = t & 0xffffffffL; + long z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); +} + +static inline ulong +_gpu_mul_hi_u64(ulong x, ulong y) +{ + ulong x0 = x & 0xffffffffUL; + ulong x1 = x >> 32; + ulong y0 = y & 0xffffffffUL; + ulong y1 = y >> 32; + ulong z0 = x0*y0; + ulong t = x1*y0 + (z0 >> 32); + ulong z1 = t & 0xffffffffUL; + ulong z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); +} +
diff --git a/amd-builtins/int/mad_hi_base.cl b/amd-builtins/int/mad_hi_base.cl new file mode 100644 index 0000000..85e0631 --- /dev/null +++ b/amd-builtins/int/mad_hi_base.cl
@@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +mad_hi(char a, char b, char c) +{ + return (char)(((int)a * (int)b) >> 8) + c; +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +mad_hi(uchar a, uchar b, uchar c) +{ + return (uchar)(((uint)a * (uint)b) >> 8) + c; +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +mad_hi(short a, short b, short c) +{ + return (short)(((int)a * (int)b) >> 16) + c; +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +mad_hi(ushort a, ushort b, ushort c) +{ + return (ushort)(((uint)a * (uint)b) >> 16) + c; +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +mad_hi(int a, int b, int c) +{ + return (int)(((long)a * (long)b) >> 32) + c; +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +mad_hi(uint a, uint b, uint c) +{ + return (uint)(((ulong)a * (ulong)b) >> 32) + c; +} + + +// ----- long ----- + +__attribute__((overloadable, always_inline)) long +mad_hi(long a, long b, long c) +{ + return _gpu_mul_hi_i64(a, b) + c; +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +mad_hi(ulong a, ulong b, ulong c) +{ + return _gpu_mul_hi_u64(a, b) + c; +} +
diff --git a/amd-builtins/int/mad_sat_base.cl b/amd-builtins/int/mad_sat_base.cl new file mode 100644 index 0000000..cc9270c --- /dev/null +++ b/amd-builtins/int/mad_sat_base.cl
@@ -0,0 +1,141 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +mad_sat(char a, char b, char c) +{ + int s = (int)a * (int)b + (int)c; + return min(127, max(-128, s)); +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +mad_sat(uchar a, uchar b, uchar c) +{ + uint s = (uint)a * (uint)b + (uint)c; + return min(255U, s); +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +mad_sat(short a, short b, short c) +{ + int s = (int)a * (int)b + (int)c; + return min(32767, max(-32768, s)); +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +mad_sat(ushort a, ushort b, ushort c) +{ + uint s = (uint)a * (uint)b + (uint)c; + return min(65535U, s); +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +mad_sat(int a, int b, int c) +{ + int lo = a * b; + int hi = __amdil_imul_high_i32(a, b); + int t = lo + c; + hi += c > 0 & 0x7fffffff - c < lo; + hi -= c < 1 & (int)0x80000000 - c > lo; + lo = t; + + lo = hi < 0 & (hi != -1 | lo >= 0) ? 0x80000000 : lo; + lo = hi >= 0 & (hi > 0 | lo < 0) ? 0x7fffffff : lo; + return lo; +} + +// ----- uint ----- +__attribute__((overloadable, always_inline)) uint +mad_sat(uint a, uint b, uint c) +{ + uint lo = a * b; + uint hi = __amdil_umul_high_u32(a, b); + uint t = lo + c; + hi += 0xffffffff - c < lo; + lo = t; + return hi > 0U ? 
0xffffffff : lo; +} + +// ----- long ----- + +__attribute__((overloadable, always_inline)) long +mad_sat(long a, long b, long c) +{ + ulong a0 = (ulong)a & 0xffffffffUL; + long a1 = a >> 32; + ulong b0 = (ulong)b & 0xffffffffUL; + long b1 = b >> 32; + ulong s0 = a0*b0; + long t = a1*b0 + (s0 >> 32); + long s1 = t & 0xffffffffL; + long s2 = t >> 32; + s1 = a0*b1 + s1; + long lo = (s1 << 32) | (s0 & 0xffffffffL); + long hi = a1*b1 + s2 + (s1 >> 32); + + t = lo + c; + hi += c > 0L & 0x7fffffffffffffffL - c < lo; + hi -= c < 1L & (long)0x8000000000000000L - c > lo; + lo = t; + + lo = hi < 0L & (hi != -1L | lo >= 0L) ? 0x8000000000000000L : lo; + lo = hi >= 0L & (hi > 0L | lo < 0L) ? 0x7fffffffffffffffL : lo; + + return lo; +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +mad_sat(ulong a, ulong b, ulong c) +{ + ulong a0 = a & 0xffffffffUL; + ulong a1 = a >> 32; + ulong b0 = b & 0xffffffffUL; + ulong b1 = b >> 32; + ulong s0 = a0*b0; + ulong t = a1*b0 + (s0 >> 32); + ulong s1 = t & 0xffffffffUL; + ulong s2 = t >> 32; + s1 = a0*b1 + s1; + ulong lo = (s1 << 32) | (s0 & 0xffffffffUL); + ulong hi = a1*b1 + s2 + (s1 >> 32); + + t = lo + c; + hi += 0xffffffffffffffffUL - c < lo; + lo = t; + + return hi > 0UL ? 0xffffffffffffffffUL : lo; +} +
diff --git a/amd-builtins/int/max_base.cl b/amd-builtins/int/max_base.cl new file mode 100644 index 0000000..2048256 --- /dev/null +++ b/amd-builtins/int/max_base.cl
@@ -0,0 +1,92 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +max(char x, char y) +{ + return __amdil_imax_i32(x, y); +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +max(uchar x, uchar y) +{ + return __amdil_umax_u32(x, y); +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +max(short x, short y) +{ + return __amdil_imax_i32(x, y); +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +max(ushort x, ushort y) +{ + return __amdil_umax_u32(x, y); +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +max(int x, int y) +{ + return __amdil_imax_i32(x, y); +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +max(uint x, uint y) +{ + return __amdil_umax_u32(x, y); +} + +// ----- long ----- + +// __hsail_ intrinsics which has no __amdil_ equivalents. +extern __attribute__((pure)) long __hsail_max_s64(long, long); +extern __attribute__((pure)) ulong __hsail_max_u64(ulong, ulong); + +__attribute__((overloadable, always_inline)) long +max(long x, long y) +{ + return __hsail_max_s64(x, y); +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +max(ulong x, ulong y) +{ + return __hsail_max_u64(x, y); +} +
diff --git a/amd-builtins/int/min_base.cl b/amd-builtins/int/min_base.cl new file mode 100644 index 0000000..798ad57 --- /dev/null +++ b/amd-builtins/int/min_base.cl
@@ -0,0 +1,92 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +min(char x, char y) +{ + return __amdil_imin_i32(x, y); +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +min(uchar x, uchar y) +{ + return __amdil_umin_u32(x, y); +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +min(short x, short y) +{ + return __amdil_imin_i32(x, y); +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +min(ushort x, ushort y) +{ + return __amdil_umin_u32(x, y); +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +min(int x, int y) +{ + return __amdil_imin_i32(x, y); +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +min(uint x, uint y) +{ + return __amdil_umin_u32(x, y); +} + +// ----- long ----- + +// __hsail_ intrinsics which has no __amdil_ equivalents. +extern __attribute__((pure)) long __hsail_min_s64(long, long); +extern __attribute__((pure)) ulong __hsail_min_u64(ulong, ulong); + +__attribute__((overloadable, always_inline)) long +min(long x, long y) +{ + return __hsail_min_s64(x, y); +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +min(ulong x, ulong y) +{ + return __hsail_min_u64(x, y); +} +
diff --git a/amd-builtins/int/mul24_base.cl b/amd-builtins/int/mul24_base.cl new file mode 100644 index 0000000..0184c65 --- /dev/null +++ b/amd-builtins/int/mul24_base.cl
@@ -0,0 +1,51 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "ibuiltins.h" + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +mul24(int x, int y) +{ + return __amdil_imul24_i32(x, y); +} + +__attribute__((overloadable, always_inline)) int +mad24(int a, int b, int c) +{ + return __amdil_imad24_i32(a, b, c); +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +mul24(uint x, uint y) +{ + return __amdil_umul24_u32(x, y); +} + +__attribute__((overloadable, always_inline)) uint +mad24(uint a, uint b, uint c) +{ + return __amdil_umad24_u32(a, b, c); +}
diff --git a/amd-builtins/int/mul_hi_base.cl b/amd-builtins/int/mul_hi_base.cl new file mode 100644 index 0000000..63d7cc6 --- /dev/null +++ b/amd-builtins/int/mul_hi_base.cl
@@ -0,0 +1,91 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +mul_hi(char x, char y) +{ + return (char)(((int)x * (int)y) >> 8); +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +mul_hi(uchar x, uchar y) +{ + return (uchar)(((uint)x * (uint)y) >> 8); +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +mul_hi(short x, short y) +{ + return (short)(((int)x * (int)y) >> 16); +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +mul_hi(ushort x, ushort y) +{ + return (ushort)(((uint)x * (uint)y) >> 16); +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +mul_hi(int x, int y) +{ + return __amdil_imul_high_i32(x, y); +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +mul_hi(uint x, uint y) +{ + return __amdil_umul_high_u32(x, y); +} + +extern __attribute__((pure)) long __hsail_mulhi_s64(long, long); +extern __attribute__((pure)) ulong __hsail_mulhi_u64(ulong, ulong); + +// ----- long ----- + +__attribute__((overloadable, always_inline)) long +mul_hi(long x, long y) +{ + return __hsail_mulhi_s64(x, y); +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +mul_hi(ulong x, ulong y) +{ + return __hsail_mulhi_u64(x, y); +} +
diff --git a/amd-builtins/int/popcnt_base.cl b/amd-builtins/int/popcnt_base.cl new file mode 100644 index 0000000..79d0720 --- /dev/null +++ b/amd-builtins/int/popcnt_base.cl
@@ -0,0 +1,116 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +#ifdef USE_POPCNT +#pragma OPENCL EXTENSION cl_amd_popcnt : enable +#endif + +// ----- [u]char ----- + +__attribute__((always_inline)) static char +__POPCI1(char x) +{ + return __amdil_count_bits_i32((int)x & 0xff); +} + +#ifdef USE_POPCNT +extern __attribute__((overloadable, alias("__POPCI1"))) char popcnt(char); +#endif + +extern __attribute__((overloadable, alias("__POPCI1"))) char popcount(char); + +__attribute__((always_inline)) static uchar +__POPCU1(uchar x) +{ + return __amdil_count_bits_i32((int)x); +} + +#ifdef USE_POPCNT +extern __attribute__((overloadable, alias("__POPCU1"))) uchar popcnt(uchar); +#endif + +extern __attribute__((overloadable, alias("__POPCU1"))) uchar popcount(uchar); + + +// ----- [u]short ----- + +__attribute__((always_inline)) static short +__POPCI2(short x) +{ + return __amdil_count_bits_i32((int)x & 0xffff); +} + +#ifdef USE_POPCNT +extern __attribute__((overloadable, alias("__POPCI2"))) short popcnt(short); +#endif + +extern __attribute__((overloadable, alias("__POPCI2"))) short popcount(short); + +__attribute__((always_inline)) static ushort +__POPCU2(ushort x) +{ + return __amdil_count_bits_i32((int)x); +} + +#ifdef USE_POPCNT +extern __attribute__((overloadable, alias("__POPCU2"))) ushort popcnt(ushort); +#endif + +extern __attribute__((overloadable, alias("__POPCU2"))) ushort popcount(ushort); + + +// ----- [u]int ----- + +__attribute__((always_inline)) static int +__POPCI4(int x) +{ + return __amdil_count_bits_i32(x); +} + +#ifdef USE_POPCNT +extern __attribute__((overloadable, alias("__POPCI4"))) int popcnt(int); +extern __attribute__((overloadable, alias("__POPCI4"))) uint popcnt(uint); +#endif + +extern __attribute__((overloadable, alias("__POPCI4"))) int popcount(int); +extern __attribute__((overloadable, alias("__POPCI4"))) uint popcount(uint); + +// ----- [u]long ----- + +__attribute__((always_inline)) static long +__POPCI8(long x) +{ + int chi = __amdil_count_bits_i32((int)(x >> 32)); 
+ int clo = __amdil_count_bits_i32((int)(x & 0xffffffffL)); + return chi + clo; +} + +#ifdef USE_POPCNT +extern __attribute__((overloadable, alias("__POPCI8"))) long popcnt(long); +extern __attribute__((overloadable, alias("__POPCI8"))) ulong popcnt(ulong); +#endif + +extern __attribute__((overloadable, alias("__POPCI8"))) long popcount(long); +extern __attribute__((overloadable, alias("__POPCI8"))) ulong popcount(ulong); +
diff --git a/amd-builtins/int/rhadd_base.cl b/amd-builtins/int/rhadd_base.cl new file mode 100644 index 0000000..e33168f --- /dev/null +++ b/amd-builtins/int/rhadd_base.cl
@@ -0,0 +1,93 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + + +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +rhadd(char x, char y) +{ + // compiler automatically casts larger + return (x + y + 1) >> 1; +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +rhadd(uchar x, uchar y) +{ + // compiler automatically casts larger + return (x + y + 1U) >> 1; +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +rhadd(short x, short y) +{ + // compiler automatically casts larger + return (x + y + 1) >> 1; +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +rhadd(ushort x, ushort y) +{ + // compiler automatically casts larger + return (x + y + 1U) >> 1; +} + +// ----- int ----- + +__attribute__((overloadable, always_inline)) int +rhadd(int x, int y) +{ + int cin = (x | y) & 1; + return (x >> 1) + (y >> 1) + cin; +} + +// ----- uint ----- +__attribute__((overloadable, always_inline)) uint +rhadd(uint x, uint y) +{ + uint cin = (x | y) & 1; + return (x >> 1) + (y >> 1) + cin; +} + +// ----- long ----- +__attribute__((overloadable, always_inline)) long +rhadd(long x, long y) +{ + long cin = (x | y) & 1; + return (x >> 1) + (y >> 1) + cin; +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +rhadd(ulong x, ulong y) +{ + ulong cin = (x | y) & 1; + return (x >> 1) + (y >> 1) + cin; +} +
diff --git a/amd-builtins/int/rotate_base.cl b/amd-builtins/int/rotate_base.cl new file mode 100644 index 0000000..a0ecf78 --- /dev/null +++ b/amd-builtins/int/rotate_base.cl
@@ -0,0 +1,72 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "ibuiltins.h" + +/* rotate(x, y): bitwise left-rotate x by (y mod bit-width). Signed variants alias the unsigned implementation (same bit pattern). */ +// ----- [u]char ----- + +__attribute__((always_inline)) static uchar +__ROTI1(uchar x, uchar y) +{ + y &= 0x7; + /* operands promote to int, so x >> 8 when y == 0 is well defined (yields 0) */ + return (x << y) | (x >> (8-y)); +} + +extern __attribute__((overloadable, alias("__ROTI1"))) uchar rotate(uchar, uchar); +extern __attribute__((overloadable, alias("__ROTI1"))) char rotate(char, char); + +// ----- [u]short ----- + +__attribute__((always_inline)) static ushort +__ROTI2(ushort x, ushort y) +{ + y &= 0xf; + return (x << y) | (x >> (16-y)); +} + +extern __attribute__((overloadable, alias("__ROTI2"))) ushort rotate(ushort, ushort); +extern __attribute__((overloadable, alias("__ROTI2"))) short rotate(short, short); + +// ----- [u]int ----- +extern __attribute__((const)) uint __hsail_bitalign_b32(uint, uint, uint); + +__attribute__((always_inline)) static uint +__ROTI4(uint x, uint y) +{ + /* NOTE(review): relies on HSAIL bitalign_b32(x, x, (32 - y) & 31) implementing a 32-bit left rotate - confirm intrinsic semantics */ + return __hsail_bitalign_b32(x, x, (-y) & 0x1f); +} + +extern __attribute__((overloadable, alias("__ROTI4"))) uint rotate(uint, uint); +extern __attribute__((overloadable, alias("__ROTI4"))) int rotate(int, int); + +// ----- [u]long ----- + +__attribute__((always_inline)) static ulong +__ROTI8(ulong x, ulong y) +{ + y &= 0x3f; + /* NOTE(review): when y == 0 this computes x >> 64; OpenCL defines shift counts modulo the type width (so the OR still yields x), but this would be undefined in plain C - confirm target semantics */ + return (x << y) | (x >> (64-y)); +} + +extern __attribute__((overloadable, alias("__ROTI8"))) ulong rotate(ulong, ulong); +extern __attribute__((overloadable, alias("__ROTI8"))) long rotate(long, long); +
diff --git a/amd-builtins/int/sub_sat_base.cl b/amd-builtins/int/sub_sat_base.cl new file mode 100644 index 0000000..cbca16c --- /dev/null +++ b/amd-builtins/int/sub_sat_base.cl
@@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +/* sub_sat(x, y): x - y with the result clamped to the representable range of the type. Narrow types widen to int and clamp; full-width types detect overflow and select the saturation bound. */ +// ----- char ----- + +__attribute__((overloadable, always_inline)) char +sub_sat(char x, char y) +{ + int s = (int)x - (int) y; + return max(-128, min(127, s)); +} + +// ----- uchar ----- + +__attribute__((overloadable, always_inline)) uchar +sub_sat(uchar x, uchar y) +{ + int s = (int)x - (int)y; + return (uchar)max(s, 0); +} + +// ----- short ----- + +__attribute__((overloadable, always_inline)) short +sub_sat(short x, short y) +{ + int s = (int)x - (int) y; + return max(-32768, min(32767, s)); +} + +// ----- ushort ----- + +__attribute__((overloadable, always_inline)) ushort +sub_sat(ushort x, ushort y) +{ + int s = (int)x - (int)y; + return (ushort)max(s, 0); +} + +// ----- int ----- + +/* precedence: '&' binds looser than '<' and '+', so "y < 1 & MAX + y < x" parses as (y < 1) & ((MAX + y) < x): overflow toward +MAX only when y <= 0, toward MIN only when y > 0. NOTE(review): the wrapped x - y assumes two's-complement wraparound on signed overflow - confirm the target guarantees this. */ +__attribute__((overloadable, always_inline)) int +sub_sat(int x, int y) +{ + int s = x - y; + s = y < 1 & 0x7fffffff + y < x ? 0x7fffffff : s; + s = y > 0 & (int)0x80000000 + y > x ? (int)0x80000000 : s; + return s; +} + +// ----- uint ----- + +__attribute__((overloadable, always_inline)) uint +sub_sat(uint x, uint y) +{ + uint s = x - y; + return y > x ? 0U : s; +} + +// ----- long ----- + +__attribute__((overloadable, always_inline)) long +sub_sat(long x, long y) +{ + long s = x - y; + s = y < 1 & 0x7fffffffffffffffL + y < x ? 0x7fffffffffffffffL : s; + s = y > 0 & (long)0x8000000000000000L + y > x ? (long)0x8000000000000000L : s; + return s; +} + +// ----- ulong ----- + +__attribute__((overloadable, always_inline)) ulong +sub_sat(ulong x, ulong y) +{ + ulong s = x - y; + return y > x ? 0UL : s; +} +
diff --git a/amd-builtins/int/upsample_base.cl b/amd-builtins/int/upsample_base.cl new file mode 100644 index 0000000..7cb0e60 --- /dev/null +++ b/amd-builtins/int/upsample_base.cl
@@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "ibuiltins.h" + +/* upsample(hi, lo): concatenate hi:lo into the next wider type, ((wide)hi << bits(lo)) | lo. lo is always unsigned, so the OR cannot sign-extend; signedness of the result follows hi. */ +// ----- (u)char ----- + +__attribute__((overloadable, always_inline)) ushort +upsample(uchar hi, uchar lo) +{ + return ((ushort)hi << 8) | lo; +} + +__attribute__((overloadable, always_inline)) short +upsample(char hi, uchar lo) +{ + return ((short)hi << 8) | lo; +} + +// ----- (u)short ----- + +__attribute__((overloadable, always_inline)) uint +upsample(ushort hi, ushort lo) +{ + return ((uint)hi << 16) | lo; +} + +__attribute__((overloadable, always_inline)) int +upsample(short hi, ushort lo) +{ + return ((int)hi << 16) | lo; +} + +// ----- (u)int ----- + +__attribute__((overloadable, always_inline)) ulong +upsample(uint hi, uint lo) +{ + return ((ulong)hi << 32) | lo; +} + +__attribute__((overloadable, always_inline)) long +upsample(int hi, uint lo) +{ + return ((long)hi << 32) | lo; +} +
diff --git a/amd-builtins/math32/acosF.cl b/amd-builtins/math32/acosF.cl new file mode 100644 index 0000000..a55faee --- /dev/null +++ b/amd-builtins/math32/acosF.cl
@@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +acos(float x) +{ + // Computes arccos(x). + // The argument is first reduced by noting that arccos(x) + // is invalid for abs(x) > 1. For denormal and small + // arguments arccos(x) = pi/2 to machine accuracy. + // Remaining argument ranges are handled as follows. + // For abs(x) <= 0.5 use + // arccos(x) = pi/2 - arcsin(x) + // = pi/2 - (x + x^3*R(x^2)) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. + // For abs(x) > 0.5 exploit the identity: + // arccos(x) = pi - 2*arcsin(sqrt(1-x)/2) + // together with the above rational approximation, and + // reconstruct the terms carefully. + + + // Some constants and split constants. 
+ const float piby2 = 1.5707963705e+00F; + const float pi = 3.1415926535897933e+00F; + const float piby2_head = 1.5707963267948965580e+00F; + const float piby2_tail = 6.12323399573676603587e-17F; + + uint ux = as_uint(x); + uint aux = ux & ~SIGNBIT_SP32; + int xneg = ux != aux; + int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + float y = as_float(aux); + + // transform if |x| >= 0.5 + int transform = xexp >= -1; + + float y2 = y * y; + float yt = 0.5f * (1.0f - y); + float r = transform ? yt : y2; + + // Use a rational approximation for [0.0, 0.5] + float a = mad(r, + mad(r, + mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F), + -0.0565298683201845211985026327361F), + 0.184161606965100694821398249421F); + + float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F); + float u = r * MATH_DIVIDE(a, b); + + /* s1 is s with the low mantissa bits zeroed; c is the correction so that s1 + c ~= sqrt(r) to extra precision */ + float s = MATH_SQRT(r); + y = s; + float s1 = as_float(as_uint(s) & 0xffff0000); + float c = MATH_DIVIDE(mad(s1, -s1, r), s + s1); + float rettn = mad(s + mad(y, u, -piby2_tail), -2.0f, pi); + float rettp = 2.0F * (s1 + mad(y, u, c)); + float rett = xneg ? rettn : rettp; + float ret = piby2_head - (x - mad(x, -u, piby2_tail)); + + /* special cases: |x| > 1 -> NaN, x == 1 -> 0, x == -1 -> pi, |x| < 2^-26 -> pi/2 */ + ret = transform ? rett : ret; + ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret; + ret = ux == 0x3f800000U ? 0.0f : ret; + ret = ux == 0xbf800000U ? pi : ret; + ret = xexp < -26 ? piby2 : ret; + return ret; +}
diff --git a/amd-builtins/math32/acoshF.cl b/amd-builtins/math32/acoshF.cl new file mode 100644 index 0000000..ca8e3c7 --- /dev/null +++ b/amd-builtins/math32/acoshF.cl
@@ -0,0 +1,49 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +acosh(float x) +{ + uint ux = as_uint(x); + + // Arguments greater than 1/sqrt(epsilon) in magnitude are + // approximated by acosh(x) = ln(2) + ln(x) + // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is + // acosh(x) = ln(x + sqrt(x*x-1)) */ + /* 0x40000000 = 2.0f; 0x46000000 = 8192.0f, used here as the 1/sqrt(epsilon) cutoff */ + int high = ux > 0x46000000U; + int med = ux > 0x40000000U; + + /* x in [1,2): acosh(x) = log1p(w + sqrt(w*w + 2w)) with w = x - 1, which avoids cancellation near 1; high range adds 0x1.62e430p-1f (float nearest ln 2) to log1p(x) */ + float w = x - 1.0f; + float s = w*w + 2.0f*w; + float t = x*x - 1.0f; + float r = MATH_SQRT(med ? t : s) + (med ? x : w); + float v = (high ? x : r) - (med ? 1.0f : 0.0f); + float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f); + + /* +inf/NaN propagate; x < 1 is outside the domain -> NaN */ + z = ux >= PINFBITPATT_SP32 ? x : z; + z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z; + + return z; +} +
diff --git a/amd-builtins/math32/acospiF.cl b/amd-builtins/math32/acospiF.cl new file mode 100644 index 0000000..128f84b --- /dev/null +++ b/amd-builtins/math32/acospiF.cl
@@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +acospi(float x) +{ + // Computes arccos(x). + // The argument is first reduced by noting that arccos(x) + // is invalid for abs(x) > 1. For denormal and small + // arguments arccos(x) = pi/2 to machine accuracy. + // Remaining argument ranges are handled as follows. + // For abs(x) <= 0.5 use + // arccos(x) = pi/2 - arcsin(x) + // = pi/2 - (x + x^3*R(x^2)) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. + // For abs(x) > 0.5 exploit the identity: + // arccos(x) = pi - 2*arcsin(sqrt(1-x)/2) + // together with the above rational approximation, and + // reconstruct the terms carefully. + + + // Some constants and split constants. 
+ const float pi = 3.1415926535897933e+00f; + const float piby2_head = 1.5707963267948965580e+00f; /* 0x3ff921fb54442d18 */ + const float piby2_tail = 6.12323399573676603587e-17f; /* 0x3c91a62633145c07 */ + + uint ux = as_uint(x); + uint aux = ux & ~SIGNBIT_SP32; + int xneg = ux != aux; + int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + float y = as_float(aux); + + // transform if |x| >= 0.5 + int transform = xexp >= -1; + + float y2 = y * y; + float yt = 0.5f * (1.0f - y); + float r = transform ? yt : y2; + + // Use a rational approximation for [0.0, 0.5] + float a = mad(r, mad(r, mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F), + -0.0565298683201845211985026327361F), + 0.184161606965100694821398249421F); + float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F); + float u = r * MATH_DIVIDE(a, b); + + /* s1 is s with low mantissa bits zeroed; c corrects s1 toward sqrt(r); every path divides by pi at the end since acospi(x) = acos(x)/pi */ + float s = MATH_SQRT(r); + y = s; + float s1 = as_float(as_uint(s) & 0xffff0000); + float c = MATH_DIVIDE(r - s1 * s1, s + s1); + // float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + (y * u - piby2_tail)), pi); + float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + mad(y, u, -piby2_tail)), pi); + // float rettp = MATH_DIVIDE(2.0F * s1 + (2.0F * c + 2.0F * y * u), pi); + float rettp = MATH_DIVIDE(2.0f*(s1 + mad(y, u, c)), pi); + float rett = xneg ? rettn : rettp; + // float ret = MATH_DIVIDE(piby2_head - (x - (piby2_tail - x * u)), pi); + float ret = MATH_DIVIDE(piby2_head - (x - mad(x, -u, piby2_tail)), pi); + + /* special cases: |x| > 1 -> NaN, x == 1 -> 0, x == -1 -> 1, |x| < 2^-26 -> 0.5 */ + ret = transform ? rett : ret; + ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret; + ret = ux == 0x3f800000U ? 0.0f : ret; + ret = ux == 0xbf800000U ? 1.0f : ret; + ret = xexp < -26 ? 0.5f : ret; + return ret; +}
diff --git a/amd-builtins/math32/all_native32.cl b/amd-builtins/math32/all_native32.cl new file mode 100644 index 0000000..6fd1506 --- /dev/null +++ b/amd-builtins/math32/all_native32.cl
@@ -0,0 +1,117 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +// HSAIL versions of native built-ins + +// HSAIL intrinsic functions +extern __attribute__((pure)) float __hsail_ncos_f32(float); +extern __attribute__((pure)) float __hsail_nexp2_f32(float); +extern __attribute__((pure)) float __hsail_nlog2_f32(float); +extern __attribute__((pure)) float __hsail_nrcp_f32(float); +extern __attribute__((pure)) float __hsail_nrsqrt_f32(float); +extern __attribute__((pure)) float __hsail_nsin_f32(float); +extern __attribute__((pure)) float __hsail_nsqrt_f32(float); + +// Value of log2(10) +#define M_LOG2_10_F 3.32192809488736f +// Value of 1 / log2(10) +#define M_RLOG2_10_F 0.30102999566398f +// Value of 1 / M_LOG2E_F = 1 / log2(e) +#define M_RLOG2_E_F 0.69314718055995f + + +/* HSAIL only provides native exp2/log2; exp, exp10, log and log10 are synthesized from them via the base-change constants above. */ +__attribute__((overloadable, always_inline)) float +native_cos(float x) { + return __hsail_ncos_f32(x); +} + +__attribute__((overloadable, always_inline)) float +native_divide(float x, float y) { + return native_recip(y)*x; +} + +__attribute__((overloadable, always_inline)) float +native_exp2(float x) { + return __hsail_nexp2_f32(x); +} + +__attribute__((overloadable,weak,always_inline)) float +native_exp(float f) { + // There is no native exp in HSAIL, but we have exp2 instruction. + return __hsail_nexp2_f32(M_LOG2E_F*f); +} + +__attribute__((overloadable,weak,always_inline)) float +native_exp10(float f) { + // There is no native exp10 in HSAIL, but we have exp2 instruction. + return __hsail_nexp2_f32(M_LOG2_10_F*f); +} + +__attribute__((overloadable, always_inline)) float +native_log2(float x) { + return __hsail_nlog2_f32(x); +} + +__attribute__((overloadable,weak,always_inline)) float +native_log(float f) { + // There is no native log in HSAIL, but we have log2 instruction. + return __hsail_nlog2_f32(f)*M_RLOG2_E_F; +} + +__attribute__((overloadable,weak,always_inline)) float +native_log10(float f) { + // There is no native log10 in HSAIL, but we have log2 instruction. 
+ return __hsail_nlog2_f32(f)*M_RLOG2_10_F; +} + +__attribute__((overloadable, always_inline)) float +native_powr(float x, float y) +{ + return native_exp2(native_log2(x)*y); +} + +__attribute__((overloadable, always_inline)) float +native_recip(float x) { + return __hsail_nrcp_f32(x); +} + +__attribute__((overloadable, always_inline)) float +native_rsqrt(float x) +{ + return __hsail_nrsqrt_f32(x); +} + +__attribute__((overloadable, always_inline)) float +native_sin(float x) { + return __hsail_nsin_f32(x); +} + +__attribute__((overloadable, always_inline)) float +native_sqrt(float x) { + return __hsail_nsqrt_f32(x); +} + +/* NOTE(review): __amdil_tan_f32 is declared here but never used; native_tan is composed from native_sin, native_cos and native_recip instead. */ +extern __attribute__((pure)) float __amdil_tan_f32(float,float); +__attribute__((overloadable, always_inline)) float +native_tan(float x) +{ + return native_sin(x)*native_recip(native_cos(x)); +}
diff --git a/amd-builtins/math32/asinF.cl b/amd-builtins/math32/asinF.cl new file mode 100644 index 0000000..58c3f57 --- /dev/null +++ b/amd-builtins/math32/asinF.cl
@@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +asin(float x) +{ + // Computes arcsin(x). + // The argument is first reduced by noting that arcsin(x) + // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). + // For denormal and small arguments arcsin(x) = x to machine + // accuracy. Remaining argument ranges are handled as follows. + // For abs(x) <= 0.5 use + // arcsin(x) = x + x^3*R(x^2) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. + // For abs(x) > 0.5 exploit the identity: + // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) + // together with the above rational approximation, and + // reconstruct the terms carefully. 
+ + + const float piby2_tail = 7.5497894159e-08F; /* 0x33a22168 */ + const float hpiby2_head = 7.8539812565e-01F; /* 0x3f490fda */ + const float piby2 = 1.5707963705e+00F; /* 0x3fc90fdb */ + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint xs = ux ^ aux; + float spiby2 = as_float(xs | as_uint(piby2)); + int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + float y = as_float(aux); + + // abs(x) >= 0.5 + int transform = xexp >= -1; + + float y2 = y * y; + float rt = 0.5f * (1.0f - y); + float r = transform ? rt : y2; + + // Use a rational approximation for [0.0, 0.5] + float a = mad(r, + mad(r, + mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F), + -0.0565298683201845211985026327361F), + 0.184161606965100694821398249421F); + + float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F); + float u = r * MATH_DIVIDE(a, b); + + /* s1 is s with the low mantissa bits zeroed; c corrects s1 toward sqrt(r); vt reconstructs pi/2 - 2*asin(sqrt((1-|x|)/2)) from head/tail pieces */ + float s = MATH_SQRT(r); + float s1 = as_float(as_uint(s) & 0xffff0000); + float c = MATH_DIVIDE(mad(-s1, s1, r), s + s1); + float p = mad(2.0f*s, u, -mad(c, -2.0f, piby2_tail)); + float q = mad(s1, -2.0f, hpiby2_head); + float vt = hpiby2_head - (p - q); + float v = mad(y, u, y); + v = transform ? vt : v; + + /* special cases: |x| > 1 -> NaN, |x| == 1 -> +-pi/2 (sign applied via spiby2), |x| < 2^-14 -> x */ + float ret = as_float(xs | as_uint(v)); + ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret; + ret = aux == 0x3f800000U ? spiby2 : ret; + ret = xexp < -14 ? x : ret; + + return ret; +} +
diff --git a/amd-builtins/math32/asinhF.cl b/amd-builtins/math32/asinhF.cl new file mode 100644 index 0000000..45c5d1f --- /dev/null +++ b/amd-builtins/math32/asinhF.cl
@@ -0,0 +1,72 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math32.h" + +__attribute__((overloadable)) float +asinh(float x) +{ + uint ux = as_uint(x); + uint ax = ux & EXSIGNBIT_SP32; + uint xsgn = ax ^ ux; + + // |x| <= 2 + /* rational approximation R(x^2) such that asinh(x) = x + x^3*R(x^2) on |x| <= 2 */ + float t = x * x; + float a = mad(t, + mad(t, + mad(t, + mad(t, -1.177198915954942694e-4f, -4.162727710583425360e-2f), + -5.063201055468483248e-1f), + -1.480204186473758321f), + -1.152965835871758072f); + float b = mad(t, + mad(t, + mad(t, + mad(t, 6.284381367285534560e-2f, 1.260024978680227945f), + 6.582362487198468066f), + 11.99423176003939087f), + 6.917795026025976739f); + + float q = MATH_DIVIDE(a, b); + float z1 = mad(x*t, q, x); + + // |x| > 2 + + // Arguments greater than 1/sqrt(epsilon) in magnitude are + // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x + // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are + // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1)) + // with the sign of x (see Abramowitz and Stegun 4.6.20) + + /* 0x46000000 = 8192.0f, used as the 1/sqrt(epsilon) cutoff; 0x1.62e430p-1f is the float nearest ln 2 */ + float absx = as_float(ax); + int hi = ax > 0x46000000U; + float y = MATH_SQRT(absx * absx + 1.0f) + absx; + y = hi ? absx : y; + float r = log(y) + (hi ? 0x1.62e430p-1f : 0.0f); + float z2 = as_float(xsgn | as_uint(r)); + + /* 0x40000000 = 2.0f; 0x39800000 = 0x1.0p-12f, below which asinh(x) == x to float precision; inf/NaN propagate */ + float z = ax <= 0x40000000 ? z1 : z2; + z = ax < 0x39800000U | ax >= PINFBITPATT_SP32 ? x : z; + + return z; +} +
diff --git a/amd-builtins/math32/asinpiF.cl b/amd-builtins/math32/asinpiF.cl new file mode 100644 index 0000000..009cdf4 --- /dev/null +++ b/amd-builtins/math32/asinpiF.cl
@@ -0,0 +1,90 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float
+asinpi(float x)
+{ + // Computes arcsin(x). + // The argument is first reduced by noting that arcsin(x) + // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). + // For denormal and small arguments arcsin(x) = x to machine + // accuracy. Remaining argument ranges are handled as follows. + // For abs(x) <= 0.5 use + // arcsin(x) = x + x^3*R(x^2) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. + // For abs(x) > 0.5 exploit the identity: + // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) + // together with the above rational approximation, and + // reconstruct the terms carefully. 
+ + + const float pi = 3.1415926535897933e+00f; + const float piby2_tail = 7.5497894159e-08F; /* 0x33a22168 */ + const float hpiby2_head = 7.8539812565e-01F; /* 0x3f490fda */ + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint xs = ux ^ aux; + float shalf = as_float(xs | as_uint(0.5f)); + + int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + float y = as_float(aux); + + // abs(x) >= 0.5 + int transform = xexp >= -1; + + float y2 = y * y; + float rt = 0.5f * (1.0f - y); + float r = transform ? rt : y2; + + // Use a rational approximation for [0.0, 0.5] + float a = mad(r, + mad(r, + mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F), + -0.0565298683201845211985026327361F), + 0.184161606965100694821398249421F); + float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F); + float u = r * MATH_DIVIDE(a, b); + + /* same reconstruction as asin, then divided by pi since asinpi(x) = asin(x)/pi */ + float s = MATH_SQRT(r); + float s1 = as_float(as_uint(s) & 0xffff0000); + float c = MATH_DIVIDE(mad(-s1, s1, r), s + s1); + float p = mad(2.0f*s, u, -mad(c, -2.0f, piby2_tail)); + float q = mad(s1, -2.0f, hpiby2_head); + float vt = hpiby2_head - (p - q); + float v = mad(y, u, y); + v = transform ? vt : v; + v = MATH_DIVIDE(v, pi); + float xbypi = MATH_DIVIDE(x, pi); + + /* special cases: |x| > 1 -> NaN, |x| == 1 -> +-0.5 (sign applied via shalf), |x| < 2^-14 -> x/pi */ + float ret = as_float(xs | as_uint(v)); + ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret; + ret = aux == 0x3f800000U ? shalf : ret; + ret = xexp < -14 ? xbypi : ret; + + return ret; +} +
diff --git a/amd-builtins/math32/atan2F.cl b/amd-builtins/math32/atan2F.cl new file mode 100644 index 0000000..82b3ac1 --- /dev/null +++ b/amd-builtins/math32/atan2F.cl
@@ -0,0 +1,158 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +#ifndef TABLE_BASED_ATAN2 +__attribute__((overloadable)) float +atan2(float y, float x) +{ + const float pi = 0x1.921fb6p+1f; + const float piby2 = 0x1.921fb6p+0; + const float piby4 = 0x1.921fb6p-1f; + const float threepiby4 = 0x1.2d97c8p+1f; + + float ax = fabs(x); + float ay = fabs(y); + float v = min(ax, ay); + float u = max(ax, ay); + + // Scale since u could be large, as in "regular" divide + float s = u > 0x1.0p+96f ? 
0x1.0p-32 : 1.0f; + float vbyu = s * MATH_DIVIDE(v, s*u); + + float vbyu2 = vbyu * vbyu; + +#define USE_2_2_APPROXIMATION +#if defined USE_2_2_APPROXIMATION + float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu; + float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); +#else + float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu; + float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f); +#endif + + // Octant 0 result + float a = mad(p, MATH_RECIP(q), vbyu); + + // Fix up 3 other octants + float at = piby2 - a; + a = ay > ax ? at : a; + at = pi - a; + a = x < 0.0F ? at : a; + + // y == 0 => 0 for x >= 0, pi for x < 0 + at = as_int(x) < 0 ? pi : 0.0f; + a = y == 0.0f ? at : a; + + // if (!FINITE_ONLY()) { + // x and y are +- Inf + at = x > 0.0f ? piby4 : threepiby4; + a = ax == INFINITY & ay == INFINITY ? at : a; + + // x or y is NaN + a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a; + // } + + // Fixup sign and return + return copysign(a, y); +} +#else +__attribute__((overloadable)) float +atan2(float y, float x) +{ + USE_TABLE(float, p_tbl, M32_ATAN2_JBY256); + + // Explicitly flush arguments + x = FTZ(x); + y = FTZ(y); + + uint uy = as_uint(y); + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint auy = uy & EXSIGNBIT_SP32; + + // General case: take absolute values of arguments + float u = as_float(aux); + float v = as_float(auy); + + // Swap u and v if necessary to obtain 0 < v < u + int swap_vu = u < v; + float uu = u; + u = swap_vu ? v : u; + v = swap_vu ? 
uu : v; + + // Use full range division here because the reciprocal of u could be subnormal + float vbyu = v / u; + + // Handle large quotient with table and polynomial approximation + int big = vbyu > 0.0625f; + + int index = (int) mad(vbyu, 256.0f, 0.5f); + float findex = (float)index; + float r = MATH_DIVIDE(mad(vbyu, 256.0f, -findex), mad(vbyu, findex, 256.0f)); + float s = r * r; + index = clamp(index-16, 0, 240); + float qbig = mad(r*s, -0.33333333333224095522f, r) + p_tbl[index]; + + // Handle small quotient with a series expansion + s = vbyu * vbyu; + float q = mad(s, -mad(s, -0.14285713561807169030f, 0.19999999999393223405f), 0.33333333333333170500f); + q = mad(vbyu*s, -q, vbyu); + q = big ? qbig : q; + + // Tidy-up according to which quadrant the arguments lie in + const float piby2 = 1.5707963267948966e+00f; + float qt = piby2 - q; + q = swap_vu ? qt : q; + + int xneg = ux != aux; + const float pi = 3.1415926535897932e+00f; + qt = pi - q; + q = xneg ? qt : q; + + uint ysign = uy ^ auy; + q = as_float(ysign | as_uint(q)); + + // Now handle a few special cases + // Zero y gives +-0 for positive x and +-pi for negative x + qt = as_float(ysign | as_uint(pi)); + qt = xneg ? qt : y; + q = y == 0.0f ? qt : q; + + if (!FINITE_ONLY()) { + // If abs(x) and abs(y) are both infinity return +-pi/4 or +- 3pi/4 according to signs + const float piby4 = 7.8539816339744831e-01f; + const float three_piby4 = 2.3561944901923449e+00f; + qt = xneg ? three_piby4 : piby4; + qt = as_float(ysign | as_uint(qt)); + q = auy == PINFBITPATT_SP32 & aux == PINFBITPATT_SP32 ? qt : q; + + // If either arg was NaN, return it + q = aux > PINFBITPATT_SP32 ? x : q; + q = auy > PINFBITPATT_SP32 ? y : q; + } + + return q; +} +#endif +
diff --git a/amd-builtins/math32/atan2F_table.h b/amd-builtins/math32/atan2F_table.h new file mode 100644 index 0000000..e46527a --- /dev/null +++ b/amd-builtins/math32/atan2F_table.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/* Table ATAN2_TABLE_JBY256 contains precomputed values of atan(j/256),
+for j = 16, 17, ..., 256.
*/ +DECLARE_TABLE(float, ATAN2_TABLE_JBY256, 241, + 6.24188099959573430842e-02f, /* 0x3faff55bb72cfde9 */ + 6.63088949198234745008e-02f, /* 0x3fb0f99ea71d52a6 */ + 7.01969710718705064423e-02f, /* 0x3fb1f86dbf082d58 */ + 7.40829225490337306415e-02f, /* 0x3fb2f719318a4a9a */ + 7.79666338315423007588e-02f, /* 0x3fb3f59f0e7c559d */ + 8.18479898030765457007e-02f, /* 0x3fb4f3fd677292fb */ + 8.57268757707448092464e-02f, /* 0x3fb5f2324fd2d7b2 */ + 8.96031774848717321724e-02f, /* 0x3fb6f03bdcea4b0c */ + 9.34767811585894559112e-02f, /* 0x3fb7ee182602f10e */ + 9.73475734872236708739e-02f, /* 0x3fb8ebc54478fb28 */ + 1.01215441667466668485e-01f, /* 0x3fb9e94153cfdcf1 */ + 1.05080273416329528224e-01f, /* 0x3fbae68a71c722b8 */ + 1.08941956989865793015e-01f, /* 0x3fbbe39ebe6f07c3 */ + 1.12800381201659388752e-01f, /* 0x3fbce07c5c3cca32 */ + 1.16655435441069349478e-01f, /* 0x3fbddd21701eba6e */ + 1.20507009691224548087e-01f, /* 0x3fbed98c2190043a */ + 1.24354994546761424279e-01f, /* 0x3fbfd5ba9aac2f6d */ + 1.28199281231298117811e-01f, /* 0x3fc068d584212b3d */ + 1.32039761614638734288e-01f, /* 0x3fc0e6adccf40881 */ + 1.35876328229701304195e-01f, /* 0x3fc1646541060850 */ + 1.39708874289163620386e-01f, /* 0x3fc1e1fafb043726 */ + 1.43537293701821222491e-01f, /* 0x3fc25f6e171a535c */ + 1.47361481088651630200e-01f, /* 0x3fc2dcbdb2fba1ff */ + 1.51181331798580037562e-01f, /* 0x3fc359e8edeb99a3 */ + 1.54996741923940972718e-01f, /* 0x3fc3d6eee8c6626c */ + 1.58807608315631065832e-01f, /* 0x3fc453cec6092a9e */ + 1.62613828597948567589e-01f, /* 0x3fc4d087a9da4f17 */ + 1.66415301183114927586e-01f, /* 0x3fc54d18ba11570a */ + 1.70211925285474380276e-01f, /* 0x3fc5c9811e3ec269 */ + 1.74003600935367680469e-01f, /* 0x3fc645bfffb3aa73 */ + 1.77790228992676047071e-01f, /* 0x3fc6c1d4898933d8 */ + 1.81571711160032150945e-01f, /* 0x3fc73dbde8a7d201 */ + 1.85347949995694760705e-01f, /* 0x3fc7b97b4bce5b02 */ + 1.89118848926083965578e-01f, /* 0x3fc8350be398ebc7 */ + 1.92884312257974643856e-01f, /* 
0x3fc8b06ee2879c28 */ + 1.96644245190344985064e-01f, /* 0x3fc92ba37d050271 */ + 2.00398553825878511514e-01f, /* 0x3fc9a6a8e96c8626 */ + 2.04147145182116990236e-01f, /* 0x3fca217e601081a5 */ + 2.07889927202262986272e-01f, /* 0x3fca9c231b403279 */ + 2.11626808765629753628e-01f, /* 0x3fcb1696574d780b */ + 2.15357699697738047551e-01f, /* 0x3fcb90d7529260a2 */ + 2.19082510780057748701e-01f, /* 0x3fcc0ae54d768466 */ + 2.22801153759394493514e-01f, /* 0x3fcc84bf8a742e6d */ + 2.26513541356919617664e-01f, /* 0x3fccfe654e1d5395 */ + 2.30219587276843717927e-01f, /* 0x3fcd77d5df205736 */ + 2.33919206214733416127e-01f, /* 0x3fcdf110864c9d9d */ + 2.37612313865471241892e-01f, /* 0x3fce6a148e96ec4d */ + 2.41298826930858800743e-01f, /* 0x3fcee2e1451d980c */ + 2.44978663126864143473e-01f, /* 0x3fcf5b75f92c80dd */ + 2.48651741190513253521e-01f, /* 0x3fcfd3d1fc40dbe4 */ + 2.52317980886427151166e-01f, /* 0x3fd025fa510665b5 */ + 2.55977303013005474952e-01f, /* 0x3fd061eea03d6290 */ + 2.59629629408257511791e-01f, /* 0x3fd09dc597d86362 */ + 2.63274882955282396590e-01f, /* 0x3fd0d97ee509acb3 */ + 2.66912987587400396539e-01f, /* 0x3fd1151a362431c9 */ + 2.70543868292936529052e-01f, /* 0x3fd150973a9ce546 */ + 2.74167451119658789338e-01f, /* 0x3fd18bf5a30bf178 */ + 2.77783663178873208022e-01f, /* 0x3fd1c735212dd883 */ + 2.81392432649178403370e-01f, /* 0x3fd2025567e47c95 */ + 2.84993688779881237938e-01f, /* 0x3fd23d562b381041 */ + 2.88587361894077354396e-01f, /* 0x3fd278372057ef45 */ + 2.92173383391398755471e-01f, /* 0x3fd2b2f7fd9b5fe2 */ + 2.95751685750431536626e-01f, /* 0x3fd2ed987a823cfe */ + 2.99322202530807379706e-01f, /* 0x3fd328184fb58951 */ + 3.02884868374971361060e-01f, /* 0x3fd362773707ebcb */ + 3.06439619009630070945e-01f, /* 0x3fd39cb4eb76157b */ + 3.09986391246883430384e-01f, /* 0x3fd3d6d129271134 */ + 3.13525122985043869228e-01f, /* 0x3fd410cbad6c7d32 */ + 3.17055753209146973237e-01f, /* 0x3fd44aa436c2af09 */ + 3.20578221991156986359e-01f, /* 0x3fd4845a84d0c21b */ + 
3.24092470489871664618e-01f, /* 0x3fd4bdee586890e6 */ + 3.27598440950530811477e-01f, /* 0x3fd4f75f73869978 */ + 3.31096076704132047386e-01f, /* 0x3fd530ad9951cd49 */ + 3.34585322166458920545e-01f, /* 0x3fd569d88e1b4cd7 */ + 3.38066122836825466713e-01f, /* 0x3fd5a2e0175e0f4e */ + 3.41538425296541714449e-01f, /* 0x3fd5dbc3fbbe768d */ + 3.45002177207105076295e-01f, /* 0x3fd614840309cfe1 */ + 3.48457327308122011278e-01f, /* 0x3fd64d1ff635c1c5 */ + 3.51903825414964732676e-01f, /* 0x3fd685979f5fa6fd */ + 3.55341622416168290144e-01f, /* 0x3fd6bdeac9cbd76c */ + 3.58770670270572189509e-01f, /* 0x3fd6f61941e4def0 */ + 3.62190922004212156882e-01f, /* 0x3fd72e22d53aa2a9 */ + 3.65602331706966821034e-01f, /* 0x3fd7660752817501 */ + 3.69004854528964421068e-01f, /* 0x3fd79dc6899118d1 */ + 3.72398446676754202311e-01f, /* 0x3fd7d5604b63b3f7 */ + 3.75783065409248884237e-01f, /* 0x3fd80cd46a14b1d0 */ + 3.79158669033441808605e-01f, /* 0x3fd84422b8df95d7 */ + 3.82525216899905096124e-01f, /* 0x3fd87b4b0c1ebedb */ + 3.85882669398073752109e-01f, /* 0x3fd8b24d394a1b25 */ + 3.89230987951320717144e-01f, /* 0x3fd8e92916f5cde8 */ + 3.92570135011828580396e-01f, /* 0x3fd91fde7cd0c662 */ + 3.95900074055262896078e-01f, /* 0x3fd9566d43a34907 */ + 3.99220769575252543149e-01f, /* 0x3fd98cd5454d6b18 */ + 4.02532187077682512832e-01f, /* 0x3fd9c3165cc58107 */ + 4.05834293074804064450e-01f, /* 0x3fd9f93066168001 */ + 4.09127055079168300278e-01f, /* 0x3fda2f233e5e530b */ + 4.12410441597387267265e-01f, /* 0x3fda64eec3cc23fc */ + 4.15684422123729413467e-01f, /* 0x3fda9a92d59e98cf */ + 4.18948967133552840902e-01f, /* 0x3fdad00f5422058b */ + 4.22204048076583571270e-01f, /* 0x3fdb056420ae9343 */ + 4.25449637370042266227e-01f, /* 0x3fdb3a911da65c6c */ + 4.28685708391625730496e-01f, /* 0x3fdb6f962e737efb */ + 4.31912235472348193799e-01f, /* 0x3fdba473378624a5 */ + 4.35129193889246812521e-01f, /* 0x3fdbd9281e528191 */ + 4.38336559857957774877e-01f, /* 0x3fdc0db4c94ec9ef */ + 4.41534310525166673322e-01f, /* 
0x3fdc42191ff11eb6 */ + 4.44722423960939305942e-01f, /* 0x3fdc76550aad71f8 */ + 4.47900879150937292206e-01f, /* 0x3fdcaa6872f3631b */ + 4.51069655988523443568e-01f, /* 0x3fdcde53432c1350 */ + 4.54228735266762495559e-01f, /* 0x3fdd121566b7f2ad */ + 4.57378098670320809571e-01f, /* 0x3fdd45aec9ec862b */ + 4.60517728767271039558e-01f, /* 0x3fdd791f5a1226f4 */ + 4.63647609000806093515e-01f, /* 0x3fddac670561bb4f */ + 4.66767723680866497560e-01f, /* 0x3fdddf85bb026974 */ + 4.69878057975686880265e-01f, /* 0x3fde127b6b0744af */ + 4.72978597903265574054e-01f, /* 0x3fde4548066cf51a */ + 4.76069330322761219421e-01f, /* 0x3fde77eb7f175a34 */ + 4.79150242925822533735e-01f, /* 0x3fdeaa65c7cf28c4 */ + 4.82221324227853687105e-01f, /* 0x3fdedcb6d43f8434 */ + 4.85282563559221225002e-01f, /* 0x3fdf0ede98f393cf */ + 4.88333951056405479729e-01f, /* 0x3fdf40dd0b541417 */ + 4.91375477653101910835e-01f, /* 0x3fdf72b221a4e495 */ + 4.94407135071275316562e-01f, /* 0x3fdfa45dd3029258 */ + 4.97428915812172245392e-01f, /* 0x3fdfd5e0175fdf83 */ + 5.00440813147294050189e-01f, /* 0x3fe0039c73c1a40b */ + 5.03442821109336358099e-01f, /* 0x3fe01c341e82422d */ + 5.06434934483096732549e-01f, /* 0x3fe034b709250488 */ + 5.09417148796356245022e-01f, /* 0x3fe04d25314342e5 */ + 5.12389460310737621107e-01f, /* 0x3fe0657e94db30cf */ + 5.15351866012543347040e-01f, /* 0x3fe07dc3324e9b38 */ + 5.18304363603577900044e-01f, /* 0x3fe095f30861a58f */ + 5.21246951491958210312e-01f, /* 0x3fe0ae0e1639866c */ + 5.24179628782913242802e-01f, /* 0x3fe0c6145b5b43da */ + 5.27102395269579471204e-01f, /* 0x3fe0de05d7aa6f7c */ + 5.30015251423793132268e-01f, /* 0x3fe0f5e28b67e295 */ + 5.32918198386882147055e-01f, /* 0x3fe10daa77307a0d */ + 5.35811237960463593311e-01f, /* 0x3fe1255d9bfbd2a8 */ + 5.38694372597246617929e-01f, /* 0x3fe13cfbfb1b056e */ + 5.41567605391844897333e-01f, /* 0x3fe1548596376469 */ + 5.44430940071603086672e-01f, /* 0x3fe16bfa6f5137e1 */ + 5.47284380987436924748e-01f, /* 0x3fe1835a88be7c13 */ + 
5.50127933104692989907e-01f, /* 0x3fe19aa5e5299f99 */ + 5.52961601994028217888e-01f, /* 0x3fe1b1dc87904284 */ + 5.55785393822313511514e-01f, /* 0x3fe1c8fe7341f64f */ + 5.58599315343562330405e-01f, /* 0x3fe1e00babdefeb3 */ + 5.61403373889889367732e-01f, /* 0x3fe1f7043557138a */ + 5.64197577362497537656e-01f, /* 0x3fe20de813e823b1 */ + 5.66981934222700489912e-01f, /* 0x3fe224b74c1d192a */ + 5.69756453482978431069e-01f, /* 0x3fe23b71e2cc9e6a */ + 5.72521144698072359525e-01f, /* 0x3fe25217dd17e501 */ + 5.75276017956117824426e-01f, /* 0x3fe268a940696da6 */ + 5.78021083869819540801e-01f, /* 0x3fe27f261273d1b3 */ + 5.80756353567670302596e-01f, /* 0x3fe2958e59308e30 */ + 5.83481838685214859730e-01f, /* 0x3fe2abe21aded073 */ + 5.86197551356360535557e-01f, /* 0x3fe2c2215e024465 */ + 5.88903504204738026395e-01f, /* 0x3fe2d84c2961e48b */ + 5.91599710335111383941e-01f, /* 0x3fe2ee628406cbca */ + 5.94286183324841177367e-01f, /* 0x3fe30464753b090a */ + 5.96962937215401501234e-01f, /* 0x3fe31a52048874be */ + 5.99629986503951384336e-01f, /* 0x3fe3302b39b78856 */ + 6.02287346134964152178e-01f, /* 0x3fe345f01cce37bb */ + 6.04935031491913965951e-01f, /* 0x3fe35ba0b60eccce */ + 6.07573058389022313541e-01f, /* 0x3fe3713d0df6c503 */ + 6.10201443063065118722e-01f, /* 0x3fe386c52d3db11e */ + 6.12820202165241245673e-01f, /* 0x3fe39c391cd41719 */ + 6.15429352753104952356e-01f, /* 0x3fe3b198e5e2564a */ + 6.18028912282561737612e-01f, /* 0x3fe3c6e491c78dc4 */ + 6.20618898599929469384e-01f, /* 0x3fe3dc1c2a188504 */ + 6.23199329934065904268e-01f, /* 0x3fe3f13fb89e96f4 */ + 6.25770224888563042498e-01f, /* 0x3fe4064f47569f48 */ + 6.28331602434009650615e-01f, /* 0x3fe41b4ae06fea41 */ + 6.30883481900321840818e-01f, /* 0x3fe430328e4b26d5 */ + 6.33425882969144482537e-01f, /* 0x3fe445065b795b55 */ + 6.35958825666321447834e-01f, /* 0x3fe459c652badc7f */ + 6.38482330354437466191e-01f, /* 0x3fe46e727efe4715 */ + 6.40996417725432032775e-01f, /* 0x3fe4830aeb5f7bfd */ + 6.43501108793284370968e-01f, /* 
0x3fe4978fa3269ee1 */ + 6.45996424886771558604e-01f, /* 0x3fe4ac00b1c71762 */ + 6.48482387642300484032e-01f, /* 0x3fe4c05e22de94e4 */ + 6.50959018996812410762e-01f, /* 0x3fe4d4a8023414e8 */ + 6.53426341180761927063e-01f, /* 0x3fe4e8de5bb6ec04 */ + 6.55884376711170835605e-01f, /* 0x3fe4fd013b7dd17e */ + 6.58333148384755983962e-01f, /* 0x3fe51110adc5ed81 */ + 6.60772679271132590273e-01f, /* 0x3fe5250cbef1e9fa */ + 6.63202992706093175102e-01f, /* 0x3fe538f57b89061e */ + 6.65624112284960989250e-01f, /* 0x3fe54ccaf0362c8f */ + 6.68036061856020157990e-01f, /* 0x3fe5608d29c70c34 */ + 6.70438865514021320458e-01f, /* 0x3fe5743c352b33b9 */ + 6.72832547593763097282e-01f, /* 0x3fe587d81f732fba */ + 6.75217132663749830535e-01f, /* 0x3fe59b60f5cfab9d */ + 6.77592645519925151909e-01f, /* 0x3fe5aed6c5909517 */ + 6.79959111179481823228e-01f, /* 0x3fe5c2399c244260 */ + 6.82316554874748071313e-01f, /* 0x3fe5d58987169b18 */ + 6.84665002047148862907e-01f, /* 0x3fe5e8c6941043cf */ + 6.87004478341244895212e-01f, /* 0x3fe5fbf0d0d5cc49 */ + 6.89335009598845749323e-01f, /* 0x3fe60f084b46e05e */ + 6.91656621853199760075e-01f, /* 0x3fe6220d115d7b8d */ + 6.93969341323259825138e-01f, /* 0x3fe634ff312d1f3b */ + 6.96273194408023488045e-01f, /* 0x3fe647deb8e20b8f */ + 6.98568207680949848637e-01f, /* 0x3fe65aabb6c07b02 */ + 7.00854407884450081312e-01f, /* 0x3fe66d663923e086 */ + 7.03131821924453670469e-01f, /* 0x3fe6800e4e7e2857 */ + 7.05400476865049030906e-01f, /* 0x3fe692a40556fb6a */ + 7.07660399923197958039e-01f, /* 0x3fe6a5276c4b0575 */ + 7.09911618463524796141e-01f, /* 0x3fe6b798920b3d98 */ + 7.12154159993178659249e-01f, /* 0x3fe6c9f7855c3198 */ + 7.14388052156768926793e-01f, /* 0x3fe6dc44551553ae */ + 7.16613322731374569052e-01f, /* 0x3fe6ee7f10204aef */ + 7.18829999621624415873e-01f, /* 0x3fe700a7c5784633 */ + 7.21038110854851588272e-01f, /* 0x3fe712be84295198 */ + 7.23237684576317874097e-01f, /* 0x3fe724c35b4fae7b */ + 7.25428749044510712274e-01f, /* 0x3fe736b65a172dff */ + 
7.27611332626510676214e-01f, /* 0x3fe748978fba8e0f */ + 7.29785463793429123314e-01f, /* 0x3fe75a670b82d8d8 */ + 7.31951171115916565668e-01f, /* 0x3fe76c24dcc6c6c0 */ + 7.34108483259739652560e-01f, /* 0x3fe77dd112ea22c7 */ + 7.36257428981428097003e-01f, /* 0x3fe78f6bbd5d315e */ + 7.38398037123989547936e-01f, /* 0x3fe7a0f4eb9c19a2 */ + 7.40530336612692630105e-01f, /* 0x3fe7b26cad2e50fd */ + 7.42654356450917929600e-01f, /* 0x3fe7c3d311a6092b */ + 7.44770125716075148681e-01f, /* 0x3fe7d528289fa093 */ + 7.46877673555587429099e-01f, /* 0x3fe7e66c01c114fd */ + 7.48977029182941400620e-01f, /* 0x3fe7f79eacb97898 */ + 7.51068221873802288613e-01f, /* 0x3fe808c03940694a */ + 7.53151280962194302759e-01f, /* 0x3fe819d0b7158a4c */ + 7.55226235836744863583e-01f, /* 0x3fe82ad036000005 */ + 7.57293115936992444759e-01f, /* 0x3fe83bbec5cdee22 */ + 7.59351950749757920178e-01f, /* 0x3fe84c9c7653f7ea */ + 7.61402769805578416573e-01f, /* 0x3fe85d69576cc2c5 */ + 7.63445602675201784315e-01f, /* 0x3fe86e2578f87ae5 */ + 7.65480478966144461950e-01f, /* 0x3fe87ed0eadc5a2a */ + 7.67507428319308182552e-01f, /* 0x3fe88f6bbd023118 */ + 7.69526480405658186434e-01f, /* 0x3fe89ff5ff57f1f7 */ + 7.71537664922959498526e-01f, /* 0x3fe8b06fc1cf3dfe */ + 7.73541011592573490852e-01f, /* 0x3fe8c0d9145cf49d */ + 7.75536550156311621507e-01f, /* 0x3fe8d13206f8c4ca */ + 7.77524310373347682379e-01f, /* 0x3fe8e17aa99cc05d */ + 7.79504322017186335181e-01f, /* 0x3fe8f1b30c44f167 */ + 7.81476614872688268854e-01f, /* 0x3fe901db3eeef187 */ + 7.83441218733151756304e-01f, /* 0x3fe911f35199833b */ + 7.85398163397448278999e-01f, /* 0x3fe921fb54442d18 */ +) +
diff --git a/amd-builtins/math32/atan2piF.cl b/amd-builtins/math32/atan2piF.cl new file mode 100644 index 0000000..372e31f --- /dev/null +++ b/amd-builtins/math32/atan2piF.cl
@@ -0,0 +1,153 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +#ifndef TABLE_BASED_ATAN2 +__attribute__((overloadable)) float +atan2pi(float y, float x) +{ + const float pi = 0x1.921fb6p+1f; + + float ax = fabs(x); + float ay = fabs(y); + float v = min(ax, ay); + float u = max(ax, ay); + + // Scale since u could be large, as in "regular" divide + float s = u > 0x1.0p+96f ? 
0x1.0p-32 : 1.0f;
+    float vbyu = s * MATH_DIVIDE(v, s*u);
+
+    float vbyu2 = vbyu * vbyu;
+
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+    float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+    float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
+
+    // Octant 0 result
+    float a = MATH_DIVIDE(mad(p, MATH_RECIP(q), vbyu), pi);
+
+    // Fix up 3 other octants
+    float at = 0.5f - a;
+    a = ay > ax ? at : a;
+    at = 1.0f - a;
+    a = x < 0.0F ? at : a;
+
+    // y == 0 => 0 for x >= 0, 1 for x < 0 (result is scaled by 1/pi)
+    at = as_int(x) < 0 ? 1.0f : 0.0f;
+    a = y == 0.0f ? at : a;
+
+    // if (!FINITE_ONLY()) {
+    // x and y are +- Inf
+    at = x > 0.0f ? 0.25f : 0.75f;
+    a = ax == INFINITY & ay == INFINITY ? at : a;
+
+    // x or y is NaN
+    a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+    // }
+
+    // Fixup sign and return
+    return copysign(a, y);
+}
+#else
+__attribute__((overloadable)) float
+atan2pi(float y, float x)
+{
+    USE_TABLE(float, p_tbl, M32_ATAN2_JBY256);
+
+    // Explicitly flush arguments
+    x = FTZ(x);
+    y = FTZ(y);
+
+    uint uy = as_uint(y);
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint auy = uy & EXSIGNBIT_SP32;
+
+    // General case: take absolute values of arguments
+    float u = as_float(aux);
+    float v = as_float(auy);
+
+    // Swap u and v if necessary to obtain 0 < v < u
+    int swap_vu = u < v;
+    float uu = u;
+    u = swap_vu ? v : u;
+    v = swap_vu ?
uu : v;
+
+    // Use full range division here because the reciprocal of u could be subnormal
+    float vbyu = v / u;
+
+    // Handle large quotient with table and polynomial approximation
+    int big = vbyu > 0.0625f;
+
+    int index = (int) mad(vbyu, 256.0f, 0.5f);
+    float findex = (float)index;
+    float r = MATH_DIVIDE(mad(vbyu, 256.0f, -findex), mad(vbyu, findex, 256.0f));
+    float s = r * r;
+    index = clamp(index-16, 0, 240);
+    float qbig = mad(r*s, -0.33333333333224095522f, r) + p_tbl[index];
+
+    // Handle small quotient with a series expansion
+    s = vbyu * vbyu;
+    float q = mad(s, -mad(s, -0.14285713561807169030f, 0.19999999999393223405f), 0.33333333333333170500f);
+    q = mad(vbyu*s, -q, vbyu);
+    q = big ? qbig : q;
+
+    const float pi = 3.1415926535897932e+00f;
+    q = MATH_DIVIDE(q, pi);
+
+    // Tidy-up according to which quadrant the arguments lie in
+    float qt = 0.5f - q;
+    q = swap_vu ? qt : q;
+
+    int xneg = ux != aux;
+    qt = 1.0f - q;
+    q = xneg ? qt : q;
+
+    uint ysign = uy ^ auy;
+    q = as_float(ysign | as_uint(q));
+
+    // Now handle a few special cases
+    // Zero y gives +-0 for positive x and +-1 for negative x (result is scaled by 1/pi)
+    qt = as_float(ysign | 0x3f800000);
+    qt = xneg ? qt : y;
+    q = y == 0.0f ? qt : q;
+
+    if (!FINITE_ONLY()) {
+        // If abs(x) and abs(y) are both infinity return +-0.25 or +-0.75 according to signs
+        qt = xneg ? 0.75f : 0.25f;
+        qt = as_float(ysign | as_uint(qt));
+        q = auy == PINFBITPATT_SP32 & aux == PINFBITPATT_SP32 ? qt : q;
+
+        // If either arg was NaN, return it
+        q = aux > PINFBITPATT_SP32 ? x : q;
+        q = auy > PINFBITPATT_SP32 ? y : q;
+    }
+
+    return q;
+}
+#endif
diff --git a/amd-builtins/math32/atanF.cl b/amd-builtins/math32/atanF.cl new file mode 100644 index 0000000..81bac63 --- /dev/null +++ b/amd-builtins/math32/atanF.cl
@@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +atan(float x) +{ + const float piby2 = 1.5707963267948966f; // 0x3ff921fb54442d18 + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint sx = ux ^ aux; + + float spiby2 = as_float(sx | as_uint(piby2)); + + float v = as_float(aux); + + // Return for NaN + float ret = x; + + // 2^26 <= |x| <= Inf => atan(x) is close to piby2 + ret = aux <= PINFBITPATT_SP32 ? spiby2 : ret; + + // Reduce arguments 2^-19 <= |x| < 2^26 + + // 39/16 <= x < 2^26 + x = -MATH_RECIP(v); + float c = 1.57079632679489655800f; // atan(infinity) + + // 19/16 <= x < 39/16 + int l = aux < 0x401c0000; + float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f)); + x = l ? xx : x; + c = l ? 
9.82793723247329054082e-01f : c; // atan(1.5) + + // 11/16 <= x < 19/16 + l = aux < 0x3f980000U; + xx = MATH_DIVIDE(v - 1.0f, 1.0f + v); + x = l ? xx : x; + c = l ? 7.85398163397448278999e-01f : c; // atan(1) + + // 7/16 <= x < 11/16 + l = aux < 0x3f300000; + xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v); + x = l ? xx : x; + c = l ? 4.63647609000806093515e-01f : c; // atan(0.5) + + // 2^-19 <= x < 7/16 + l = aux < 0x3ee00000; + x = l ? v : x; + c = l ? 0.0f : c; + + // Core approximation: Remez(2,2) on [-7/16,7/16] + + float s = x * x; + float a = mad(s, + mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f), + 0.296528598819239217902158651186f); + + float b = mad(s, + mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f), + 0.889585796862432286486651434570f); + + float q = x * s * MATH_DIVIDE(a, b); + + float z = c - (q - x); + float zs = as_float(sx | as_uint(z)); + + ret = aux < 0x4c800000 ? zs : ret; + + // |x| < 2^-19 + ret = aux < 0x36000000 ? as_float(ux) : ret; + return ret; +} +
diff --git a/amd-builtins/math32/atanhF.cl b/amd-builtins/math32/atanhF.cl new file mode 100644 index 0000000..ca78c58 --- /dev/null +++ b/amd-builtins/math32/atanhF.cl
@@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +atanh(float x) +{ + uint ux = as_uint(x); + uint ax = ux & EXSIGNBIT_SP32; + uint xs = ux ^ ax; + + // |x| > 1 or NaN + float z = as_float(QNANBITPATT_SP32); + + // |x| == 1 + float t = as_float(xs | PINFBITPATT_SP32); + z = ax == 0x3f800000U ? t : z; + + // 1/2 <= |x| < 1 + t = as_float(ax); + t = MATH_DIVIDE(2.0f*t, 1.0f - t); + t = 0.5f * log1p(t); + t = as_float(xs | as_uint(t)); + z = ax < 0x3f800000U ? t : z; + + // |x| < 1/2 + t = x * x; + float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f); + float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f); + float p = MATH_DIVIDE(a, b); + t = mad(x*t, p, x); + z = ax < 0x3f000000 ? t : z; + + // |x| < 2^-13 + z = ax < 0x39000000U ? 
x : z; + + return z; +} +
diff --git a/amd-builtins/math32/atanpiF.cl b/amd-builtins/math32/atanpiF.cl new file mode 100644 index 0000000..064554f --- /dev/null +++ b/amd-builtins/math32/atanpiF.cl
@@ -0,0 +1,97 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +atanpi(float x) +{ + const float pi = 3.1415926535897932f; + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint sx = ux ^ aux; + + float xbypi = MATH_DIVIDE(x, pi); + float shalf = as_float(sx | as_uint(0.5f)); + + float v = as_float(aux); + + // Return for NaN + float ret = x; + + // 2^26 <= |x| <= Inf => atan(x) is close to piby2 + ret = aux <= PINFBITPATT_SP32 ? shalf : ret; + + // Reduce arguments 2^-19 <= |x| < 2^26 + + // 39/16 <= x < 2^26 + x = -MATH_RECIP(v); + float c = 1.57079632679489655800f; // atan(infinity) + + // 19/16 <= x < 39/16 + int l = aux < 0x401c0000; + float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f)); + x = l ? xx : x; + c = l ? 
9.82793723247329054082e-01f : c; // atan(1.5) + + // 11/16 <= x < 19/16 + l = aux < 0x3f980000U; + xx = MATH_DIVIDE(v - 1.0f, 1.0f + v); + x = l ? xx : x; + c = l ? 7.85398163397448278999e-01f : c; // atan(1) + + // 7/16 <= x < 11/16 + l = aux < 0x3f300000; + xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v); + x = l ? xx : x; + c = l ? 4.63647609000806093515e-01f : c; // atan(0.5) + + // 2^-19 <= x < 7/16 + l = aux < 0x3ee00000; + x = l ? v : x; + c = l ? 0.0f : c; + + // Core approximation: Remez(2,2) on [-7/16,7/16] + + float s = x * x; + float a = mad(s, + mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f), + 0.296528598819239217902158651186f); + + float b = mad(s, + mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f), + 0.889585796862432286486651434570f); + + float q = x * s * MATH_DIVIDE(a, b); + + float z = c - (q - x); + z = MATH_DIVIDE(z, pi); + float zs = as_float(sx | as_uint(z)); + + ret = aux < 0x4c800000 ? zs : ret; + + // |x| < 2^-19 + ret = aux < 0x36000000 ? xbypi : ret; + return ret; +} +
diff --git a/amd-builtins/math32/cbrtF.cl b/amd-builtins/math32/cbrtF.cl new file mode 100644 index 0000000..968d504 --- /dev/null +++ b/amd-builtins/math32/cbrtF.cl
@@ -0,0 +1,114 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math32.h" + +// Algorithm: +// +// x = (2^m)*A +// x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8)) +// x = (2^m)*2*(G/2+g/2) +// x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9)) +// +// Y = (2^(-1))*(2^(-m))*(2^m)*A +// Now, range of Y is: 0.5 <= Y < 1 +// +// F = 0x100 + (first 7 mantissa bits) + (8th mantissa bit) +// Now, range of F is: 128 <= F <= 256 +// F = F / 256 +// Now, range of F is: 0.5 <= F <= 1 +// +// f = (Y-F), with (f <= 2^(-9)) +// +// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F+f) +// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F) + cbrt(1+(f/F)) +// cbrt(x) = cbrt(2^m) * cbrt(2*F) * cbrt(1+r) +// +// r = (f/F), with (r <= 2^(-8)) +// r = f*(1/F) with (1/F) precomputed to avoid division +// +// cbrt(x) = cbrt(2^m) * cbrt(G) * (1+poly) +// +// poly = c1*r + c2*(r^2) + c3*(r^3) + c4*(r^4) + c5*(r^5) + c6*(r^6) + +__attribute__((overloadable)) float +cbrt(float x) +{ + USE_TABLE(float2, p_cbrt, CBRT_TBL); + USE_TABLE(float, p_log_inv, LOG_INV_TBL); + + uint xi = as_uint(x); + uint axi = xi & EXSIGNBIT_SP32; + uint xsign = axi ^ xi; + xi = axi; + + int m = (xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + // Treat subnormals + uint xisub = as_uint(as_float(xi | 0x3f800000) - 1.0f); + int msub = (xisub >> EXPSHIFTBITS_SP32) - 253; + int c = m == -127; + xi = c ? xisub : xi; + m = c ? msub : m; + + int m3 = m / 3; + int rem = m - m3*3; + float mf = as_float((m3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + + uint indx = (xi & 0x007f0000) + ((xi & 0x00008000) << 1); + float f = as_float((xi & MANTBITS_SP32) | 0x3f000000) - as_float(indx | 0x3f000000); + + indx >>= 16; + float r = f * p_log_inv[indx]; + float poly = mad(mad(r, 0x1.f9add4p-5f, -0x1.c71c72p-4f), r*r, r * 0x1.555556p-2f); + + // This could also be done with a 5-element table + float remH = 0x1.428000p-1f; + float remT = 0x1.45f31ap-14f; + + remH = rem == -1 ? 0x1.964000p-1f : remH; + remT = rem == -1 ? 0x1.fea53ep-13f : remT; + + remH = rem == 0 ? 
0x1.000000p+0f : remH; + remT = rem == 0 ? 0x0.000000p+0f : remT; + + remH = rem == 1 ? 0x1.428000p+0f : remH; + remT = rem == 1 ? 0x1.45f31ap-13f : remT; + + remH = rem == 2 ? 0x1.964000p+0f : remH; + remT = rem == 2 ? 0x1.fea53ep-12f : remT; + + float2 tv = p_cbrt[indx]; + float cbrtH = tv.s0; + float cbrtT = tv.s1; + + float bH = cbrtH * remH; + float bT = mad(cbrtH, remT, mad(cbrtT, remH, cbrtT*remT)); + + float z = mad(poly, bH, mad(poly, bT, bT)) + bH; + z *= mf; + z = as_float(as_uint(z) | xsign); + c = axi >= EXPBITS_SP32 | axi == 0; + z = c ? x : z; + return z; +} +
diff --git a/amd-builtins/math32/cbrtF_table.h b/amd-builtins/math32/cbrtF_table.h new file mode 100644 index 0000000..209d2b2 --- /dev/null +++ b/amd-builtins/math32/cbrtF_table.h
@@ -0,0 +1,154 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +DECLARE_TABLE(float2, CBRT_TBL, 129, + (float2)(0x1.000000p+0f, 0x0.000000p+0f), + (float2)(0x1.008000p+0f, 0x1.51cb0ap-11f), + (float2)(0x1.014000p+0f, 0x1.39221ep-12f), + (float2)(0x1.01c000p+0f, 0x1.e06908p-11f), + (float2)(0x1.028000p+0f, 0x1.1d6978p-11f), + (float2)(0x1.034000p+0f, 0x1.4ea1bep-13f), + (float2)(0x1.03c000p+0f, 0x1.833b8ep-11f), + (float2)(0x1.048000p+0f, 0x1.587002p-12f), + (float2)(0x1.050000p+0f, 0x1.ceb290p-11f), + (float2)(0x1.05c000p+0f, 0x1.d57f34p-12f), + (float2)(0x1.068000p+0f, 0x1.cc53acp-21f), + (float2)(0x1.070000p+0f, 0x1.0fe098p-11f), + (float2)(0x1.07c000p+0f, 0x1.91b586p-15f), + (float2)(0x1.084000p+0f, 0x1.1c362ep-11f), + (float2)(0x1.090000p+0f, 0x1.94398ep-15f), + (float2)(0x1.098000p+0f, 0x1.1055bcp-11f), + (float2)(0x1.0a4000p+0f, 0x1.7e63cap-19f), + (float2)(0x1.0ac000p+0f, 0x1.d99e1ap-12f), + (float2)(0x1.0b4000p+0f, 0x1.d258dep-11f), + (float2)(0x1.0c0000p+0f, 0x1.645962p-12f), + (float2)(0x1.0c8000p+0f, 0x1.8c5b0ep-11f), + (float2)(0x1.0d4000p+0f, 0x1.83d0c8p-13f), + (float2)(0x1.0dc000p+0f, 0x1.300812p-11f), + (float2)(0x1.0e4000p+0f, 0x1.f9a65ap-11f), + (float2)(0x1.0f0000p+0f, 0x1.7bbcd8p-12f), + (float2)(0x1.0f8000p+0f, 0x1.7cbf68p-11f), + (float2)(0x1.104000p+0f, 0x1.b2c166p-14f), + (float2)(0x1.10c000p+0f, 0x1.d56ea4p-12f), + (float2)(0x1.114000p+0f, 0x1.99eb32p-11f), + (float2)(0x1.120000p+0f, 0x1.1007a2p-13f), + (float2)(0x1.128000p+0f, 0x1.d212aap-12f), + (float2)(0x1.130000p+0f, 0x1.890f18p-11f), + (float2)(0x1.13c000p+0f, 0x1.2104e2p-14f), + (float2)(0x1.144000p+0f, 0x1.74961ep-12f), + (float2)(0x1.14c000p+0f, 0x1.4b9b66p-11f), + (float2)(0x1.154000p+0f, 0x1.d81e66p-11f), + (float2)(0x1.160000p+0f, 0x1.7f825cp-13f), + (float2)(0x1.168000p+0f, 0x1.c5dca2p-12f), + (float2)(0x1.170000p+0f, 0x1.6153bap-11f), + (float2)(0x1.178000p+0f, 0x1.db1cc2p-11f), + (float2)(0x1.184000p+0f, 0x1.4154b0p-13f), + (float2)(0x1.18c000p+0f, 0x1.821114p-12f), + (float2)(0x1.194000p+0f, 0x1.2d4240p-11f), + 
(float2)(0x1.19c000p+0f, 0x1.950d82p-11f), + (float2)(0x1.1a4000p+0f, 0x1.f8755cp-11f), + (float2)(0x1.1b0000p+0f, 0x1.5e12a4p-13f), + (float2)(0x1.1b8000p+0f, 0x1.648c38p-12f), + (float2)(0x1.1c0000p+0f, 0x1.08c43ep-11f), + (float2)(0x1.1c8000p+0f, 0x1.5b0970p-11f), + (float2)(0x1.1d0000p+0f, 0x1.a91fe8p-11f), + (float2)(0x1.1d8000p+0f, 0x1.f311b6p-11f), + (float2)(0x1.1e4000p+0f, 0x1.c74618p-14f), + (float2)(0x1.1ec000p+0f, 0x1.eabb54p-13f), + (float2)(0x1.1f4000p+0f, 0x1.70db14p-12f), + (float2)(0x1.1fc000p+0f, 0x1.e45cbcp-12f), + (float2)(0x1.204000p+0f, 0x1.27faa6p-11f), + (float2)(0x1.20c000p+0f, 0x1.59db98p-11f), + (float2)(0x1.214000p+0f, 0x1.87da46p-11f), + (float2)(0x1.21c000p+0f, 0x1.b1ffa0p-11f), + (float2)(0x1.224000p+0f, 0x1.d85478p-11f), + (float2)(0x1.22c000p+0f, 0x1.fae17ep-11f), + (float2)(0x1.238000p+0f, 0x1.9af40cp-15f), + (float2)(0x1.240000p+0f, 0x1.a6319ep-14f), + (float2)(0x1.248000p+0f, 0x1.30baa6p-13f), + (float2)(0x1.250000p+0f, 0x1.7fc362p-13f), + (float2)(0x1.258000p+0f, 0x1.c05362p-13f), + (float2)(0x1.260000p+0f, 0x1.f28a98p-13f), + (float2)(0x1.268000p+0f, 0x1.0b4442p-12f), + (float2)(0x1.270000p+0f, 0x1.16361ap-12f), + (float2)(0x1.278000p+0f, 0x1.1a2a2ap-12f), + (float2)(0x1.280000p+0f, 0x1.172f8ep-12f), + (float2)(0x1.288000p+0f, 0x1.0d5530p-12f), + (float2)(0x1.290000p+0f, 0x1.f9538ep-13f), + (float2)(0x1.298000p+0f, 0x1.ca77b0p-13f), + (float2)(0x1.2a0000p+0f, 0x1.8e336ap-13f), + (float2)(0x1.2a8000p+0f, 0x1.44a304p-13f), + (float2)(0x1.2b0000p+0f, 0x1.dbc4c8p-14f), + (float2)(0x1.2b8000p+0f, 0x1.141a2ap-14f), + (float2)(0x1.2c0000p+0f, 0x1.93e44cp-17f), + (float2)(0x1.2c4000p+0f, 0x1.e6e432p-11f), + (float2)(0x1.2cc000p+0f, 0x1.c447c6p-11f), + (float2)(0x1.2d4000p+0f, 0x1.9e80d8p-11f), + (float2)(0x1.2dc000p+0f, 0x1.7595dcp-11f), + (float2)(0x1.2e4000p+0f, 0x1.498d30p-11f), + (float2)(0x1.2ec000p+0f, 0x1.1a6d1ep-11f), + (float2)(0x1.2f4000p+0f, 0x1.d077bap-12f), + (float2)(0x1.2fc000p+0f, 0x1.65ff1ep-12f), + 
(float2)(0x1.304000p+0f, 0x1.eaf912p-13f), + (float2)(0x1.30c000p+0f, 0x1.fbefb8p-14f), + (float2)(0x1.314000p+0f, 0x1.44905ap-19f), + (float2)(0x1.318000p+0f, 0x1.c017e6p-11f), + (float2)(0x1.320000p+0f, 0x1.7bfdbep-11f), + (float2)(0x1.328000p+0f, 0x1.34fbc6p-11f), + (float2)(0x1.330000p+0f, 0x1.d62f48p-12f), + (float2)(0x1.338000p+0f, 0x1.3cadc6p-12f), + (float2)(0x1.340000p+0f, 0x1.3afc06p-13f), + (float2)(0x1.344000p+0f, 0x1.fc556ep-11f), + (float2)(0x1.34c000p+0f, 0x1.a71f84p-11f), + (float2)(0x1.354000p+0f, 0x1.4f2290p-11f), + (float2)(0x1.35c000p+0f, 0x1.e8c79cp-12f), + (float2)(0x1.364000p+0f, 0x1.2dd0d8p-12f), + (float2)(0x1.36c000p+0f, 0x1.b5ac2ep-14f), + (float2)(0x1.370000p+0f, 0x1.d3d02ap-11f), + (float2)(0x1.378000p+0f, 0x1.6e3d58p-11f), + (float2)(0x1.380000p+0f, 0x1.060200p-11f), + (float2)(0x1.388000p+0f, 0x1.364608p-12f), + (float2)(0x1.390000p+0f, 0x1.6d29b6p-14f), + (float2)(0x1.394000p+0f, 0x1.bd8d5ep-11f), + (float2)(0x1.39c000p+0f, 0x1.4ae030p-11f), + (float2)(0x1.3a4000p+0f, 0x1.ab44b2p-12f), + (float2)(0x1.3ac000p+0f, 0x1.7761cep-13f), + (float2)(0x1.3b0000p+0f, 0x1.e38710p-11f), + (float2)(0x1.3b8000p+0f, 0x1.66b2b0p-11f), + (float2)(0x1.3c0000p+0f, 0x1.cebf96p-12f), + (float2)(0x1.3c8000p+0f, 0x1.964b20p-13f), + (float2)(0x1.3cc000p+0f, 0x1.e15004p-11f), + (float2)(0x1.3d4000p+0f, 0x1.5a9bcep-11f), + (float2)(0x1.3dc000p+0f, 0x1.a2f4d8p-12f), + (float2)(0x1.3e4000p+0f, 0x1.17c056p-13f), + (float2)(0x1.3e8000p+0f, 0x1.b800f8p-11f), + (float2)(0x1.3f0000p+0f, 0x1.27b132p-11f), + (float2)(0x1.3f8000p+0f, 0x1.2a09b8p-12f), + (float2)(0x1.400000p+0f, 0x0.000000p+0f), + (float2)(0x1.404000p+0f, 0x1.68a69cp-11f), + (float2)(0x1.40c000p+0f, 0x1.9df950p-12f), + (float2)(0x1.414000p+0f, 0x1.983050p-14f), + (float2)(0x1.418000p+0f, 0x1.94c6a4p-11f), + (float2)(0x1.420000p+0f, 0x1.e88494p-12f), + (float2)(0x1.428000p+0f, 0x1.45f31ap-13f), +) +
diff --git a/amd-builtins/math32/ceilF.cl b/amd-builtins/math32/ceilF.cl new file mode 100644 index 0000000..137ebe1 --- /dev/null +++ b/amd-builtins/math32/ceilF.cl
@@ -0,0 +1,29 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline)) float +ceil(float x) +{ + return __amdil_round_posinf_f32(x); +}
diff --git a/amd-builtins/math32/copysignF.cl b/amd-builtins/math32/copysignF.cl new file mode 100644 index 0000000..c60cbaf --- /dev/null +++ b/amd-builtins/math32/copysignF.cl
@@ -0,0 +1,33 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +// __hsail_ intrinsic which has no __amdil_ equivalent. +extern __attribute__((pure)) float __hsail_copysign_f32(float, float); + +__attribute__((overloadable, always_inline)) float +copysign(float x, float y) +{ + return __hsail_copysign_f32(x, y); +} +
diff --git a/amd-builtins/math32/cosF.cl b/amd-builtins/math32/cosF.cl new file mode 100644 index 0000000..f5431f0 --- /dev/null +++ b/amd-builtins/math32/cosF.cl
@@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#if 1 +#include "math32.h" +#include "remainderF_piby2.h" +#include "sincosF_piby4.h" +//#else +//extern __attribute__((pure)) float __amdil_cos_f32(float); +#endif + +__attribute__((overloadable, pure)) float +cos(float x) +{ +#if 1 + int ix = as_int(x); + int ax = ix & 0x7fffffff; + float dx = as_float(ax); + + float r0, r1; + int regn = argReductionS(&r0, &r1, dx); + + float ss = -sinf_piby4_new(r0, r1); + float cc = cosf_piby4_new(r0, r1); + + float c = (regn & 1) != 0 ? ss : cc; + c = as_float(as_int(c) ^ ((regn > 1) << 31)); + + c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c; + + return c; +#else + // TODO_HSA: Using native_cos for now. + return native_cos(x); +#endif +} +
diff --git a/amd-builtins/math32/coshF.cl b/amd-builtins/math32/coshF.cl new file mode 100644 index 0000000..244bae1 --- /dev/null +++ b/amd-builtins/math32/coshF.cl
@@ -0,0 +1,100 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable)) float +cosh(float x) +{ + // After dealing with special cases the computation is split into regions as follows. + // abs(x) >= max_cosh_arg: + // cosh(x) = sign(x)*Inf + // abs(x) >= small_threshold: + // cosh(x) = sign(x)*exp(abs(x))/2 computed using the + // splitexp and scaleDouble functions as for exp_amd(). + // abs(x) < small_threshold: + // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + // cosh(x) is then z. + + // Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. 
+ USE_TABLE(float2, p_tbl, SINHCOSH_TBL); + + const float max_cosh_arg = 0x1.65a9fap+6f; + const float small_threshold = 0x1.0a2b24p+3f; + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + float y = as_float(aux); + + // Find the integer part y0 of y and the increment dy = y - y0. We then compute + // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) + // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) + // where sinh(y0) and cosh(y0) are tabulated above. + + int ind = (int)y; + ind = (uint)ind > 36U ? 0 : ind; + + float dy = y - ind; + float dy2 = dy * dy; + + float sdy = mad(dy2, + mad(dy2, + mad(dy2, + mad(dy2, + mad(dy2, + mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f), + 0.250521176994133472333666e-7f), + 0.275573191913636406057211e-5f), + 0.198412698413242405162014e-3f), + 0.833333333333329931873097e-2f), + 0.166666666666666667013899e0f); + sdy = mad(sdy, dy*dy2, dy); + + float cdy = mad(dy2, + mad(dy2, + mad(dy2, + mad(dy2, + mad(dy2, + mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f), + 0.275573350756016588011357e-6f), + 0.248015872460622433115785e-4f), + 0.138888888889814854814536e-2f), + 0.416666666666660876512776e-1f), + 0.500000000000000005911074e0f); + cdy = mad(cdy, dy2, 1.0f); + + float2 tv = p_tbl[ind]; + float z = mad(tv.s0, sdy, tv.s1 * cdy); + + // When exp(-x) is insignificant compared to exp(x), return exp(x)/2 + float t = exp(y - 0x1.62e500p-1f); + float zsmall = mad(0x1.a0210ep-18f, t, t); + z = y >= small_threshold ? zsmall : z; + + // Corner cases + z = y >= max_cosh_arg ? as_float(PINFBITPATT_SP32) : z; + z = aux > PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : z; + z = aux < 0x38800000 ? 1.0f : z; + + return z; +} +
diff --git a/amd-builtins/math32/cospiF.cl b/amd-builtins/math32/cospiF.cl new file mode 100644 index 0000000..2ed79ab --- /dev/null +++ b/amd-builtins/math32/cospiF.cl
@@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" +#include "sincospiF_piby4.h" + +__attribute__((overloadable)) float +cospi(float x) +{ + const float pi = 3.1415926535897932F; + + int ix = as_int(x) & 0x7fffffff; + float ax = as_float(ix); + int iax = (int)ax; + float r = ax - iax; + int xodd = iax & 0x1 ? 0x80000000 : 0; + + // Initialize with return for +-Inf and NaN + int ir = 0x7fc00000; + + // 2^24 <= |x| < Inf, the result is always even integer + ir = ix < 0x7f800000 ? 0x3f800000 : ir; + + // 2^23 <= |x| < 2^24, the result is always integer + ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir; + + // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval + + // r < 1.0 + float a = 1.0f - r; + int e = 1; + int s = xodd ^ 0x80000000; + + // r <= 0.75 + int c = r <= 0.75f; + a = c ? r - 0.5f : a; + e = c ? 
0 : e; + + // r < 0.5 + c = r < 0.5f; + a = c ? 0.5f - r : a; + s = c ? xodd : s; + + // r <= 0.25 + c = r <= 0.25f; + a = c ? r : a; + e = c ? 1 : e; + + float2 t = sincosf_piby4(a * pi); + int jr = s ^ as_int(e ? t.hi : t.lo); + + ir = ix < 0x4b000000 ? jr : ir; + + return as_float(ir); +} +
diff --git a/amd-builtins/math32/erfF.cl b/amd-builtins/math32/erfF.cl new file mode 100644 index 0000000..94e372b --- /dev/null +++ b/amd-builtins/math32/erfF.cl
@@ -0,0 +1,183 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" +#if !defined(SUBNORMALS_SUPPORTED) +#include "floattointconversion.h" +#endif //SUBNORMALS_SUPPORTED + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== +*/ + +#define erx 8.4506291151e-01f /* 0x3f58560b */ + +// Coefficients for approximation to erf on [00.84375] + +#define efx 1.2837916613e-01f /* 0x3e0375d4 */ +#define efx8 1.0270333290e+00f /* 0x3f8375d4 */ + +#define pp0 1.2837916613e-01f /* 0x3e0375d4 */ +#define pp1 -3.2504209876e-01f /* 0xbea66beb */ +#define pp2 -2.8481749818e-02f /* 0xbce9528f */ +#define pp3 -5.7702702470e-03f /* 0xbbbd1489 */ +#define pp4 -2.3763017452e-05f /* 0xb7c756b1 */ +#define qq1 3.9791721106e-01f /* 0x3ecbbbce */ +#define qq2 6.5022252500e-02f /* 0x3d852a63 */ +#define qq3 5.0813062117e-03f /* 0x3ba68116 */ +#define qq4 1.3249473704e-04f /* 0x390aee49 */ +#define qq5 -3.9602282413e-06f /* 0xb684e21a */ + +// Coefficients for approximation to erf in [0.843751.25] + +#define pa0 -2.3621185683e-03f /* 0xbb1acdc6 */ +#define pa1 4.1485610604e-01f /* 0x3ed46805 */ +#define pa2 -3.7220788002e-01f /* 0xbebe9208 */ +#define pa3 3.1834661961e-01f /* 0x3ea2fe54 */ +#define pa4 -1.1089469492e-01f /* 0xbde31cc2 */ +#define pa5 3.5478305072e-02f /* 0x3d1151b3 */ +#define pa6 -2.1663755178e-03f /* 0xbb0df9c0 */ +#define qa1 1.0642088205e-01f /* 0x3dd9f331 */ +#define qa2 5.4039794207e-01f /* 0x3f0a5785 */ +#define qa3 7.1828655899e-02f /* 0x3d931ae7 */ +#define qa4 1.2617121637e-01f /* 0x3e013307 */ +#define qa5 1.3637083583e-02f /* 0x3c5f6e13 */ +#define qa6 1.1984500103e-02f /* 0x3c445aa3 */ + +// Coefficients for approximation to erfc in [1.251/0.35] + +#define ra0 -9.8649440333e-03f /* 0xbc21a093 */ +#define ra1 -6.9385856390e-01f /* 0xbf31a0b7 */ +#define ra2 -1.0558626175e+01f /* 0xc128f022 */ +#define ra3 -6.2375331879e+01f /* 0xc2798057 */ +#define ra4 -1.6239666748e+02f /* 0xc322658c */ +#define ra5 -1.8460508728e+02f /* 0xc3389ae7 */ +#define ra6 -8.1287437439e+01f /* 0xc2a2932b */ +#define ra7 -9.8143291473e+00f /* 0xc11d077e */ +#define sa1 1.9651271820e+01f /* 0x419d35ce */ +#define sa2 1.3765776062e+02f /* 0x4309a863 */ 
+#define sa3 4.3456588745e+02f /* 0x43d9486f */ +#define sa4 6.4538726807e+02f /* 0x442158c9 */ +#define sa5 4.2900814819e+02f /* 0x43d6810b */ +#define sa6 1.0863500214e+02f /* 0x42d9451f */ +#define sa7 6.5702495575e+00f /* 0x40d23f7c */ +#define sa8 -6.0424413532e-02f /* 0xbd777f97 */ + +// Coefficients for approximation to erfc in [1/.3528] + +#define rb0 -9.8649431020e-03f /* 0xbc21a092 */ +#define rb1 -7.9928326607e-01f /* 0xbf4c9dd4 */ +#define rb2 -1.7757955551e+01f /* 0xc18e104b */ +#define rb3 -1.6063638306e+02f /* 0xc320a2ea */ +#define rb4 -6.3756646729e+02f /* 0xc41f6441 */ +#define rb5 -1.0250950928e+03f /* 0xc480230b */ +#define rb6 -4.8351919556e+02f /* 0xc3f1c275 */ +#define sb1 3.0338060379e+01f /* 0x41f2b459 */ +#define sb2 3.2579251099e+02f /* 0x43a2e571 */ +#define sb3 1.5367296143e+03f /* 0x44c01759 */ +#define sb4 3.1998581543e+03f /* 0x4547fdbb */ +#define sb5 2.5530502930e+03f /* 0x451f90ce */ +#define sb6 4.7452853394e+02f /* 0x43ed43a7 */ +#define sb7 -2.2440952301e+01f /* 0xc1b38712 */ + +__attribute__((overloadable)) float +erf(float x) +{ + + int hx = as_uint(x); + int ix = hx & 0x7fffffff; + float absx = as_float(ix); + + float x2 = absx * absx; + float t = 1.0f / x2; + float tt = absx - 1.0f; + t = absx < 1.25f ? tt : t; + t = absx < 0.84375f ? x2 : t; + + float u, v, tu, tv; + + // |x| < 6 + u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0); + v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1); + + tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0); + tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1); + u = absx < 0x1.6db6dcp+1f ? tu : u; + v = absx < 0x1.6db6dcp+1f ? tv : v; + + tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0); + tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1); + u = absx < 1.25f ? 
tu : u; + v = absx < 1.25f ? tv : v; + + tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0); + tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1); + u = absx < 0.84375f ? tu : u; + v = absx < 0.84375f ? tv : v; + + v = mad(t, v, 1.0f); + float q = MATH_DIVIDE(u, v); + + float ret = 1.0f; + + // |x| < 6 + float z = as_float(ix & 0xfffff000); + float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z-absx, z+absx, q)); + r = 1.0f - MATH_DIVIDE(r, absx); + ret = absx < 6.0f ? r : ret; + + r = erx + q; + ret = absx < 1.25f ? r : ret; + + ret = as_float((hx & 0x80000000) | as_int(ret)); + + r = mad(x, q, x); + ret = absx < 0.84375f ? r : ret; + + // Prevent underflow + r = 0.125f * mad(8.0f, x, efx8 * x); + ret = absx < 0x1.0p-28f ? r : ret; + + #if !defined(SUBNORMALS_SUPPORTED) + + double dx = float_uint_to_double(hx); + const double sqt4overpi = 1.1283791670955125738961589031215; + float ret1 = as_float(double_to_float_uint(sqt4overpi * dx)); + int c = as_uint(absx) == 0; + float ret2 = hx == 0 ? 0 : -0; + ret1 = c ? ret2 : ret1; + ret = x == 0. ? ret1 : ret; + #endif //SUBNORMALS_SUPPORTED + + + ret = isnan(x) ? x : ret; + + return ret; +} +
diff --git a/amd-builtins/math32/erfcF.cl b/amd-builtins/math32/erfcF.cl new file mode 100644 index 0000000..3081785 --- /dev/null +++ b/amd-builtins/math32/erfcF.cl
@@ -0,0 +1,168 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== +*/ + +#define erx 8.4506291151e-01f /* 0x3f58560b */ + +// Coefficients for approximation to erf on [00.84375] + +#define efx 1.2837916613e-01f /* 0x3e0375d4 */ +#define efx8 1.0270333290e+00f /* 0x3f8375d4 */ + +#define pp0 1.2837916613e-01f /* 0x3e0375d4 */ +#define pp1 -3.2504209876e-01f /* 0xbea66beb */ +#define pp2 -2.8481749818e-02f /* 0xbce9528f */ +#define pp3 -5.7702702470e-03f /* 0xbbbd1489 */ +#define pp4 -2.3763017452e-05f /* 0xb7c756b1 */ +#define qq1 3.9791721106e-01f /* 0x3ecbbbce */ +#define qq2 6.5022252500e-02f /* 0x3d852a63 */ +#define qq3 5.0813062117e-03f /* 0x3ba68116 */ +#define qq4 1.3249473704e-04f /* 0x390aee49 */ +#define qq5 -3.9602282413e-06f /* 0xb684e21a */ + +// Coefficients for approximation to erf in [0.843751.25] + +#define pa0 -2.3621185683e-03f /* 0xbb1acdc6 */ +#define pa1 4.1485610604e-01f /* 0x3ed46805 */ +#define pa2 -3.7220788002e-01f /* 0xbebe9208 */ +#define pa3 3.1834661961e-01f /* 0x3ea2fe54 */ +#define pa4 -1.1089469492e-01f /* 0xbde31cc2 */ +#define pa5 3.5478305072e-02f /* 0x3d1151b3 */ +#define pa6 -2.1663755178e-03f /* 0xbb0df9c0 */ +#define qa1 1.0642088205e-01f /* 0x3dd9f331 */ +#define qa2 5.4039794207e-01f /* 0x3f0a5785 */ +#define qa3 7.1828655899e-02f /* 0x3d931ae7 */ +#define qa4 1.2617121637e-01f /* 0x3e013307 */ +#define qa5 1.3637083583e-02f /* 0x3c5f6e13 */ +#define qa6 1.1984500103e-02f /* 0x3c445aa3 */ + +// Coefficients for approximation to erfc in [1.251/0.35] + +#define ra0 -9.8649440333e-03f /* 0xbc21a093 */ +#define ra1 -6.9385856390e-01f /* 0xbf31a0b7 */ +#define ra2 -1.0558626175e+01f /* 0xc128f022 */ +#define ra3 -6.2375331879e+01f /* 0xc2798057 */ +#define ra4 -1.6239666748e+02f /* 0xc322658c */ +#define ra5 -1.8460508728e+02f /* 0xc3389ae7 */ +#define ra6 -8.1287437439e+01f /* 0xc2a2932b */ +#define ra7 -9.8143291473e+00f /* 0xc11d077e */ +#define sa1 1.9651271820e+01f /* 0x419d35ce */ +#define sa2 1.3765776062e+02f /* 0x4309a863 */ 
// Remaining denominator coefficients for the erfc approximation on [1.25, 1/0.35]
#define sa3 4.3456588745e+02f /* 0x43d9486f */
#define sa4 6.4538726807e+02f /* 0x442158c9 */
#define sa5 4.2900814819e+02f /* 0x43d6810b */
#define sa6 1.0863500214e+02f /* 0x42d9451f */
#define sa7 6.5702495575e+00f /* 0x40d23f7c */
#define sa8 -6.0424413532e-02f /* 0xbd777f97 */

// Coefficients for approximation to erfc in [1/.35, 28]

#define rb0 -9.8649431020e-03f /* 0xbc21a092 */
#define rb1 -7.9928326607e-01f /* 0xbf4c9dd4 */
#define rb2 -1.7757955551e+01f /* 0xc18e104b */
#define rb3 -1.6063638306e+02f /* 0xc320a2ea */
#define rb4 -6.3756646729e+02f /* 0xc41f6441 */
#define rb5 -1.0250950928e+03f /* 0xc480230b */
#define rb6 -4.8351919556e+02f /* 0xc3f1c275 */
#define sb1 3.0338060379e+01f /* 0x41f2b459 */
#define sb2 3.2579251099e+02f /* 0x43a2e571 */
#define sb3 1.5367296143e+03f /* 0x44c01759 */
#define sb4 3.1998581543e+03f /* 0x4547fdbb */
#define sb5 2.5530502930e+03f /* 0x451f90ce */
#define sb6 4.7452853394e+02f /* 0x43ed43a7 */
#define sb7 -2.2440952301e+01f /* 0xc1b38712 */

// erfc(x) = 1 - erf(x): complementary error function, computed via
// fdlibm-style piecewise rational approximations selected branchlessly
// on |x|. For x < 0, erfc(x) = 2 - erfc(-x).
__attribute__((overloadable)) float
erfc(float x)
{
    int hx = as_int(x);
    int ix = hx & 0x7fffffff;   // |x| bit pattern
    float absx = as_float(ix);

    // Argument for polys: x^2 on [0, 0.84375), |x|-1 on [0.84375, 1.25),
    // 1/x^2 for larger |x|.
    float x2 = absx * absx;
    float t = 1.0f / x2;
    float tt = absx - 1.0f;
    t = absx < 1.25f ? tt : t;
    t = absx < 0.84375f ? x2 : t;

    // Evaluate polys — widest interval first, narrower intervals overwrite.
    float tu, tv, u, v;

    u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0);
    v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1);

    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
    tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1);
    u = absx < 0x1.6db6dap+1f ? tu : u;   // threshold ~ 1/0.35
    v = absx < 0x1.6db6dap+1f ? tv : v;

    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
    tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1);
    u = absx < 1.25f ? tu : u;
    v = absx < 1.25f ? tv : v;

    tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0);
    tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1);
    u = absx < 0.84375f ? tu : u;
    v = absx < 0.84375f ? tv : v;

    v = mad(t, v, 1.0f);

    float q = MATH_DIVIDE(u, v);   // rational approximation u/v

    // |x| >= 28: erfc underflows to 0 (positive side); negative side fixed below.
    float ret = 0.0f;

    // 1.25 <= |x| < 28: erfc via exp of a split argument; z has the
    // mantissa tail zeroed so z*z is exact.
    float z = as_float(ix & 0xfffff000);
    float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z - absx, z + absx, q));
    r = MATH_DIVIDE(r, absx);
    t = 2.0f - r;               // erfc(-|x|) = 2 - erfc(|x|)
    r = x < 0.0f ? t : r;
    ret = absx < 28.0f ? r : ret;

    // 0.84375 <= |x| < 1.25
    r = 1.0f - erx - q;
    t = erx + q + 1.0f;
    r = x < 0.0f ? t : r;
    ret = absx < 1.25f ? r : ret;

    // |x| < 0.84375: erfc(x) = 1 - erf(x) = 0.5 - (x + x*q - 0.5)
    r = 0.5f - mad(x, q, x - 0.5f);
    ret = absx < 0.84375f ? r : ret;

    // x <= -6: erfc saturates at 2 in float precision.
    ret = x < -6.0f ? 2.0f : ret;

    // NaN propagates unchanged.
    ret = isnan(x) ? x : ret;

    return ret;
}
diff --git a/amd-builtins/math32/exp10F.cl b/amd-builtins/math32/exp10F.cl new file mode 100644 index 0000000..3541a68 --- /dev/null +++ b/amd-builtins/math32/exp10F.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_EXP10 +#include "expF_base.h" +
diff --git a/amd-builtins/math32/exp2F.cl b/amd-builtins/math32/exp2F.cl new file mode 100644 index 0000000..5086eb7 --- /dev/null +++ b/amd-builtins/math32/exp2F.cl
@@ -0,0 +1,71 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#if 0 + +#define COMPILING_EXP2 +#include "expF_base.h" + +#else + +#include "math32.h" + +__attribute__((overloadable, weak)) float +exp2(float x) +{ + // Reduce x + const float ln2HI = 0x1.62e300p-1f; + const float ln2LO = 0x1.2fefa2p-17f; + + float t = rint(x); + int p = (int)t; + float tt = x - t; + float hi = tt * ln2HI; + float lo = tt * ln2LO; + + // Evaluate poly + t = hi + lo; + tt = t*t; + float v = mad(tt, + -mad(tt, + mad(tt, + mad(tt, + mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), + 0x1.1566aap-14f), + -0x1.6c16c2p-9f), + 0x1.555556p-3f), + t); + + float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); + + // Scale by 2^p + float r = as_float(as_int(y) + (p << 23)); + + const float ulim = 128.0f; + const float llim = -126.0f; + + r = x < llim ? 0.0f : r; + r = x < ulim ? 
r : as_float(0x7f800000); + return isnan(x) ? x : r; +} + +#endif
diff --git a/amd-builtins/math32/expF.cl b/amd-builtins/math32/expF.cl new file mode 100644 index 0000000..0975dc4 --- /dev/null +++ b/amd-builtins/math32/expF.cl
@@ -0,0 +1,73 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#if 0 + +#define COMPILING_EXP +#include "expF_base.h" + +#else + +#include "math32.h" + +__attribute__((overloadable, weak)) float +exp(float x) +{ + // Reduce x + const float ln2HI = 0x1.62e300p-1f; + const float ln2LO = 0x1.2fefa2p-17f; + const float invln2 = 0x1.715476p+0f; + + float fhalF = x < 0.0f ? 
-0.5f : 0.5f; + int p = mad(x, invln2, fhalF); + float fp = (float)p; + float hi = mad(fp, -ln2HI, x); // t*ln2HI is exact here + float lo = -fp*ln2LO; + + // Evaluate poly + float t = hi + lo; + float tt = t*t; + float v = mad(tt, + -mad(tt, + mad(tt, + mad(tt, + mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), + 0x1.1566aap-14f), + -0x1.6c16c2p-9f), + 0x1.555556p-3f), + t); + + float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); + + // Scale by 2^p + float r = as_float(as_int(y) + (p << 23)); + + const float ulim = 0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366 + const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657 + + r = x < llim ? 0.0f : r; + r = x < ulim ? r : as_float(0x7f800000); + return isnan(x) ? x : r; +} + +#endif +
diff --git a/amd-builtins/math32/expF_base.h b/amd-builtins/math32/expF_base.h new file mode 100644 index 0000000..8cf9ad4 --- /dev/null +++ b/amd-builtins/math32/expF_base.h
@@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +// Algorithm: +// +// e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) +// +// x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer +// n = 64*m + j, 0 <= j < 64 +// +// e^x = 2^((64*m + j + f)/64) +// = (2^m) * (2^(j/64)) * 2^(f/64) +// = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) +// +// f = x*(64/ln(2)) - n +// r = f*(ln(2)/64) = x - n*(ln(2)/64) +// +// e^x = (2^m) * (2^(j/64)) * e^r +// +// (2^(j/64)) is precomputed +// +// e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! +// e^r = 1 + q +// +// q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 
//
// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )

// Shared table-based kernel for exp/exp2/exp10; the entry point compiled
// is selected by COMPILING_EXP2 / COMPILING_EXP10 (default: exp).
// See the algorithm description above: reduce to r with |r| small,
// approximate e^r by a short Taylor series, combine with the tabulated
// 2^(j/64), and scale by 2^m.
__attribute__((overloadable, weak)) float
#if defined(COMPILING_EXP2)
exp2(float x)
#elif defined(COMPILING_EXP10)
exp10(float x)
#else
exp(float x)
#endif
{
    USE_TABLE(float, p_tbl, EXP_TBL);

    // Overflow/underflow limits for the selected base.
#if defined(COMPILING_EXP2)
    const float X_MAX = 0x1.fffffep+6f; // 128
    const float X_MIN = -0x1.2a0000p+7f; // -149
#elif defined(COMPILING_EXP10)
    const float X_MAX = 0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959
    const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332
#else
    const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673
    const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184
#endif

    // Reduction constants for the selected base (lead/tail splits keep the
    // reduced argument accurate).
#if defined(COMPILING_EXP2)
    const float R_64 = 0x1.000000p+6f; // 2^6
    const float R_1_BY_64 = 0x1.000000p-6f; // 2^-6
    const float R_LN2 = 0x1.62e430p-1f; // 0.6931471805599453
#elif defined(COMPILING_EXP10)
    const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912
    const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707
    const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
    const float R_LN10 = 0x1.26bb1cp+1f;
#else
    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
    const float R_LOG2_BY_64_LD = 0x1.620000p-7f; /* log2/64 lead: 0.0108032227 */
    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; /* log2/64 tail: 0.0000272020388 */
#endif

    int return_nan = isnan(x);
    int return_inf = x > X_MAX;
    int return_zero = x < X_MIN;

    // n = round(x * 64/ln(base)); j = n mod 64 indexes the table, m = n/64
    // is the final binary scale.
#if defined(COMPILING_EXP2)
    int n = convert_int(x * R_64);
#elif defined(COMPILING_EXP10)
    int n = convert_int(x * R_64_BY_LOG10_2);
#else
    int n = convert_int(x * R_64_BY_LOG2);
#endif

    float fn = (float)n;
    int j = n & 0x3f;
    int m = n >> 6;
    int m2 = m << EXPSHIFTBITS_SP32;
    float r;

    // r = x - n*(ln(base)/64), computed with lead/tail for accuracy, then
    // rescaled into natural-log units where needed.
#if defined(COMPILING_EXP2)
    r = R_LN2 * mad(-R_1_BY_64, fn, x);
#elif defined(COMPILING_EXP10)
    r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x));
#else
    r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x));
#endif

    // Truncated Taylor series for e^r
    float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);

    float two_to_jby64 = p_tbl[j];
    z2 = mad(two_to_jby64, z2, two_to_jby64);

    // Scale by 2^m. For m <= -126 the result is subnormal, so multiply by
    // the float whose bit pattern is 1 << (m+149) (value 2^m) instead of
    // adding to the exponent field.
    // NOTE(review): for large m the shift count m+149 exceeds 31; the
    // selected value is z2n in that case, but the shift itself is formally
    // out of range — verify the target compiler tolerates this.
    float z2s = z2 * as_float(0x1 << (m + 149));
    float z2n = as_float(as_int(z2) + m2);
    z2 = m <= -126 ? z2s : z2n;


    // Special-case selection: inf/zero/NaN override the computed value.
    z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
    z2 = return_zero ? 0.0f : z2;
    z2 = return_nan ? x : z2;
    return z2;
}
diff --git a/amd-builtins/math32/expF_table.h b/amd-builtins/math32/expF_table.h new file mode 100644 index 0000000..3d6759f --- /dev/null +++ b/amd-builtins/math32/expF_table.h
@@ -0,0 +1,158 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */

// EXP_TBL[j] = 2^(j/64) rounded to float, j = 0..64.
DECLARE_TABLE(float, EXP_TBL, 65,
    0x1.000000p+0f, 0x1.02c9a4p+0f, 0x1.059b0ep+0f, 0x1.087452p+0f,
    0x1.0b5586p+0f, 0x1.0e3ec4p+0f, 0x1.11301ep+0f, 0x1.1429aap+0f,
    0x1.172b84p+0f, 0x1.1a35bep+0f, 0x1.1d4874p+0f, 0x1.2063b8p+0f,
    0x1.2387a6p+0f, 0x1.26b456p+0f, 0x1.29e9e0p+0f, 0x1.2d285ap+0f,
    0x1.306fe0p+0f, 0x1.33c08cp+0f, 0x1.371a74p+0f, 0x1.3a7db4p+0f,
    0x1.3dea64p+0f, 0x1.4160a2p+0f, 0x1.44e086p+0f, 0x1.486a2cp+0f,
    0x1.4bfdaep+0f, 0x1.4f9b28p+0f, 0x1.5342b6p+0f, 0x1.56f474p+0f,
    0x1.5ab07ep+0f, 0x1.5e76f2p+0f, 0x1.6247ecp+0f, 0x1.662388p+0f,
    0x1.6a09e6p+0f, 0x1.6dfb24p+0f, 0x1.71f75ep+0f, 0x1.75feb6p+0f,
    0x1.7a1148p+0f, 0x1.7e2f34p+0f, 0x1.82589ap+0f, 0x1.868d9ap+0f,
    0x1.8ace54p+0f, 0x1.8f1aeap+0f, 0x1.93737cp+0f, 0x1.97d82ap+0f,
    0x1.9c4918p+0f, 0x1.a0c668p+0f, 0x1.a5503cp+0f, 0x1.a9e6b6p+0f,
    0x1.ae89fap+0f, 0x1.b33a2cp+0f, 0x1.b7f770p+0f, 0x1.bcc1eap+0f,
    0x1.c199bep+0f, 0x1.c67f12p+0f, 0x1.cb720ep+0f, 0x1.d072d4p+0f,
    0x1.d5818ep+0f, 0x1.da9e60p+0f, 0x1.dfc974p+0f, 0x1.e502eep+0f,
    0x1.ea4afap+0f, 0x1.efa1bep+0f, 0x1.f50766p+0f, 0x1.fa7c18p+0f,
    0x1.000000p+1f,
)

// EXP_TBL_EP[j] = 2^(j/64) split into a short head (few mantissa bits,
// products with it stay exact) and a positive tail; head + tail ~ 2^(j/64).
// Used by expm1 for extra-precision accumulation.
DECLARE_TABLE(float2, EXP_TBL_EP, 65,
    (float2)(0x1.000000p+0f, 0x0.000000p+0f),
    (float2)(0x1.02c000p+0f, 0x1.347ceep-13f),
    (float2)(0x1.058000p+0f, 0x1.b0d314p-12f),
    (float2)(0x1.084000p+0f, 0x1.a28c3ap-11f),
    (float2)(0x1.0b4000p+0f, 0x1.586cf8p-12f),
    (float2)(0x1.0e0000p+0f, 0x1.f61968p-11f),
    (float2)(0x1.110000p+0f, 0x1.80e808p-11f),
    (float2)(0x1.140000p+0f, 0x1.4d5754p-11f),
    (float2)(0x1.170000p+0f, 0x1.5c1e3ep-11f),
    (float2)(0x1.1a0000p+0f, 0x1.adf5b6p-11f),
    (float2)(0x1.1d4000p+0f, 0x1.0e62d0p-13f),
    (float2)(0x1.204000p+0f, 0x1.1dc430p-11f),
    (float2)(0x1.238000p+0f, 0x1.e9b9d4p-14f),
    (float2)(0x1.268000p+0f, 0x1.a2b2f0p-11f),
    (float2)(0x1.29c000p+0f, 0x1.4efa8ep-11f),
    (float2)(0x1.2d0000p+0f, 0x1.42d372p-11f),
    (float2)(0x1.304000p+0f, 0x1.7f0518p-11f),
    (float2)(0x1.33c000p+0f, 0x1.164c82p-17f),
    (float2)(0x1.370000p+0f, 0x1.a7373ap-12f),
    (float2)(0x1.3a4000p+0f, 0x1.ed9a72p-11f),
    (float2)(0x1.3dc000p+0f, 0x1.532608p-11f),
    (float2)(0x1.414000p+0f, 0x1.0510fap-11f),
    (float2)(0x1.44c000p+0f, 0x1.043030p-11f),
    (float2)(0x1.484000p+0f, 0x1.515ae0p-11f),
    (float2)(0x1.4bc000p+0f, 0x1.ed6a9ap-11f),
    (float2)(0x1.4f8000p+0f, 0x1.b2769cp-12f),
    (float2)(0x1.534000p+0f, 0x1.5ab4eap-15f),
    (float2)(0x1.56c000p+0f, 0x1.a39b5ap-11f),
    (float2)(0x1.5a8000p+0f, 0x1.83eea4p-11f),
    (float2)(0x1.5e4000p+0f, 0x1.b78ad6p-11f),
    (float2)(0x1.624000p+0f, 0x1.fac0e8p-14f),
    (float2)(0x1.660000p+0f, 0x1.1c412ap-11f),
    (float2)(0x1.6a0000p+0f, 0x1.3cccfep-13f),
    (float2)(0x1.6dc000p+0f, 0x1.d91e32p-11f),
    (float2)(0x1.71c000p+0f, 0x1.baf476p-11f),
    (float2)(0x1.75c000p+0f, 0x1.f5ab20p-11f),
    (float2)(0x1.7a0000p+0f, 0x1.1473eap-12f),
    (float2)(0x1.7e0000p+0f, 0x1.799b66p-11f),
    (float2)(0x1.824000p+0f, 0x1.89994cp-12f),
    (float2)(0x1.868000p+0f, 0x1.b33688p-13f),
    (float2)(0x1.8ac000p+0f, 0x1.ca8454p-13f),
    (float2)(0x1.8f0000p+0f, 0x1.ae9914p-12f),
    (float2)(0x1.934000p+0f, 0x1.9bd866p-11f),
    (float2)(0x1.97c000p+0f, 0x1.829fdep-12f),
    (float2)(0x1.9c4000p+0f, 0x1.230546p-13f),
    (float2)(0x1.a0c000p+0f, 0x1.99ed76p-14f),
    (float2)(0x1.a54000p+0f, 0x1.03b23ep-12f),
    (float2)(0x1.a9c000p+0f, 0x1.35aabcp-11f),
    (float2)(0x1.ae8000p+0f, 0x1.3f32b4p-13f),
    (float2)(0x1.b30000p+0f, 0x1.d15c26p-11f),
    (float2)(0x1.b7c000p+0f, 0x1.bb797cp-11f),
    (float2)(0x1.bcc000p+0f, 0x1.e904bcp-16f),
    (float2)(0x1.c18000p+0f, 0x1.9bdd84p-12f),
    (float2)(0x1.c64000p+0f, 0x1.f8972ap-11f),
    (float2)(0x1.cb4000p+0f, 0x1.906e76p-11f),
    (float2)(0x1.d04000p+0f, 0x1.96a502p-11f),
    (float2)(0x1.d58000p+0f, 0x1.8dcfbap-16f),
    (float2)(0x1.da8000p+0f, 0x1.e603dap-12f),
    (float2)(0x1.dfc000p+0f, 0x1.2e66f6p-13f),
    (float2)(0x1.e50000p+0f, 0x1.773c58p-15f),
    (float2)(0x1.ea4000p+0f, 0x1.5f4548p-13f),
    (float2)(0x1.ef8000p+0f, 0x1.0df730p-11f),
    (float2)(0x1.f50000p+0f, 0x1.d96db8p-14f),
    (float2)(0x1.fa4000p+0f, 0x1.e0c0cep-11f),
    (float2)(0x1.000000p+1f, 0x0.000000p+0f),
)
diff --git a/amd-builtins/math32/expm1F.cl b/amd-builtins/math32/expm1F.cl new file mode 100644 index 0000000..1584280 --- /dev/null +++ b/amd-builtins/math32/expm1F.cl
@@ -0,0 +1,66 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math32.h" + +/* Refer exp routine for algorithm */ + +__attribute__((overloadable)) float +expm1(float x) +{ + USE_TABLE(float2, p_tbl, EXP_TBL_EP); + + const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673 + const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184 + + const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 + const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227 + const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388 + + uint xi = as_uint(x); + int n = (int)(x * R_64_BY_LOG2); + float fn = (float)n; + + int j = n & 0x3f; + int m = n >> 6; + + float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x)); + + // Truncated Taylor series + float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f, 0x1.555556p-3f), 0.5f), r); + + float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + float2 tv = p_tbl[j]; + float two_to_jby64_h = tv.s0 * m2; + float two_to_jby64_t = tv.s1 * m2; + float two_to_jby64 = two_to_jby64_h + two_to_jby64_t; + + z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f); + //Make subnormals work + z2 = x == 0. ? x : z2; + z2 = x < X_MIN | m < -24 ? -1.0f : z2; + z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2; + z2 = isnan(x) ? x : z2; + + return z2; +} +
diff --git a/amd-builtins/math32/fabsF.cl b/amd-builtins/math32/fabsF.cl new file mode 100644 index 0000000..f1bdaad --- /dev/null +++ b/amd-builtins/math32/fabsF.cl
@@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline)) float +fabs(float x) +{ + return __amdil_fabs_f32(x); +} +
diff --git a/amd-builtins/math32/fdimF.cl b/amd-builtins/math32/fdimF.cl new file mode 100644 index 0000000..960857f --- /dev/null +++ b/amd-builtins/math32/fdimF.cl
@@ -0,0 +1,40 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline)) float +fdim(float x, float y) +{ + int n = -(isnan(x) | isnan(y)) & QNANBITPATT_SP32; + int r = -(x > y) & as_int(x - y); + return as_float(n | r); +} + +__attribute__((overloadable, always_inline)) float4 +fdim(float4 x, float4 y) +{ + int4 n = ~((x == x) & (y == y)) & QNANBITPATT_SP32; + int4 r = (x > y) & as_int4(x - y); + return as_float4(n | r); +} +
diff --git a/amd-builtins/math32/floorF.cl b/amd-builtins/math32/floorF.cl new file mode 100644 index 0000000..38490f0 --- /dev/null +++ b/amd-builtins/math32/floorF.cl
@@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline)) float +floor(float x) +{ + return __amdil_round_neginf_f32(x); +} +
diff --git a/amd-builtins/math32/fmaF.cl b/amd-builtins/math32/fmaF.cl new file mode 100644 index 0000000..053a356 --- /dev/null +++ b/amd-builtins/math32/fmaF.cl
@@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */

#include "math32.h"

// float4 fma: split into two float2 halves handled by the float2 overload.
__attribute__((overloadable, always_inline)) float4
fma(float4 a, float4 b, float4 c)
{
    float4 ret;
    ret.lo = fma(a.lo, b.lo, c.lo);
    ret.hi = fma(a.hi, b.hi, c.hi);
    return ret;
}

// float2 fma: split into two scalar fma calls.
__attribute__((overloadable, always_inline)) float2
fma(float2 a, float2 b, float2 c)
{
    float2 ret;
    ret.lo = fma(a.lo, b.lo, c.lo);
    ret.hi = fma(a.hi, b.hi, c.hi);
    return ret;
}

// Branchless select on float bit patterns: C ? B : A via the AMD IL cmov.
#define CM(C, B, A) as_float(__amdil_cmov_logical_i32(as_uint(C), as_uint(B), as_uint(A)))

// Scalar fma(a, b, c) = a*b + c with a single rounding. Uses the hardware
// FMA when available; otherwise emulates it with exponent bookkeeping and
// head/tail (Dekker-style) exact products, falling back to mad() for
// special-case inputs (zeros/subnormal-exponent, inf/NaN).
__attribute__((overloadable, always_inline)) float
fma(float a, float b, float c)
{
    if (HAVE_HW_FMA32()) {
        return __amdil_fma_f32(a, b, c);
    } else {
        float z3 = mad(a, b, c);   // fallback result for special cases
        float cs = c;              // original c, kept for far-apart exponents

        // Extract unbiased exponents of a, b, c.
        int ae = as_int(a) >> 23;
        int be = as_int(b) >> 23;
        int ce = as_int(c) >> 23;

        ae &= 0xff;
        be &= 0xff;
        ce &= 0xff;

        ae -= 127;
        be -= 127;
        ce -= 127;

        int pe = ae + be;          // exponent of the product a*b

        // Exponent of c relative to the product, re-biased for reconstruction.
        int cen = ce - pe;
        cen += 127;
        cen <<= 23;

        // special cases flag
        int spclal = ae == -127;   // a zero/subnormal-exponent
        int spclbl = be == -127;
        int spclcl = ce == -127;

        int spclah = ae == 128;    // a inf/NaN
        int spclbh = be == 128;
        int spclch = ce == 128;

        spclal |= spclah;
        spclbl |= spclbh;
        spclcl |= spclch;

        int spcl = spclal | spclbl;
        spcl |= spclcl;

        // c is inf/NaN while a, b are finite: return c itself.
        int spcl2 = spclah | spclbh;
        spcl2 = ~spcl2;
        spcl2 &= spclch;

        // Normalize: force a and b to exponent 0, move c's exponent to cen.
        int an = as_int(a) & 0x807fffff;
        int bn = as_int(b) & 0x807fffff;
        int cn = as_int(c) & 0x807fffff;

        an |= 0x3f800000;
        bn |= 0x3f800000;
        cn |= cen;

        a = as_float(an);
        b = as_float(bn);
        c = as_float(cn);

        // Get head & tail parts of a, b (head has 12 mantissa bits, so
        // head*head products are exact).
        float ah = as_float(an & 0xfffff000);
        float bh = as_float(bn & 0xfffff000);

        float at = a - ah;
        float bt = b - bh;

        // Get head & tail parts of the product a*b
        float p = a * b;
        float pt = mad(ah, bh, -p);
        pt = mad(ah, bt, pt);
        pt = mad(at, bh, pt);
        pt = mad(at, bt, pt);

        // carefully add p and c; these steps valid only when pe and ce are not far apart
        // (two-sum: rr is the rounded sum, t the exact residual)
        float rr = p + c;
        float t1 = p - rr;
        t1 += c;
        float t2 = c - rr;
        t2 += p;
        int pick1 = as_int(p) & 0x7fffffff;
        int pick2 = as_int(c) & 0x7fffffff;
        int pick = pick1 > pick2;
        float t = CM(pick, t1, t2);

        // Two-sum of the residual t with the product tail pt.
        float vv = t + pt;
        float ww1 = t - vv;
        ww1 += pt;
        float ww2 = pt - vv;
        ww2 += t;
        pick1 = as_int(t) & 0x7fffffff;
        pick2 = as_int(pt) & 0x7fffffff;
        pick = pick1 > pick2;
        float ww = CM(pick, ww1, ww2);

        // pick r,v,w based on how far apart pe and ce are
        // number 60 is safe; actual value close to 24+24+2
        pick1 = pe - ce;
        pick = pick1 < 60;
        float r = CM(pick, rr, p);
        float v = CM(pick, vv, pt);
        float w = CM(pick, ww, cs);

        // identify if there was a rounding issue, and so correction is needed
        // (v is a power of two exactly half an ulp of r, and w is nonzero)
        int rndc1 = as_int(r) & 0x7f800000;
        int rndc2 = as_int(v) & 0x7f800000;
        int rndc = rndc1 - rndc2;
        rndc = rndc == 0x0c000000;
        rndc1 = as_int(v) & 0x007fffff;
        rndc1 = rndc1 == 0;
        rndc2 = as_int(w) & 0x7fffffff;
        rndc2 = rndc2 != 0;
        rndc &= rndc1;
        rndc &= rndc2;

        // Nudge v by a tiny amount carrying w's sign to break the tie.
        int ws = as_int(w) & 0x80000000;
        int ve = as_int(v) & 0x7f800000;
        ve -= 0x0b800000;
        w = as_float(ws | ve);

        float vw = v + w;
        v = CM(rndc, vw, v);
        float z = r + v;

        // reconstruct return value: restore the product exponent pe.
        int ze = as_int(z) >> 23;
        ze &= 0xff;
        ze -= 127;
        ze += pe;
        ze += 127;

        int z1e = ze & 0xff;
        z1e <<= 23;
        int z1 = as_int(z) & 0x807fffff;
        z1 |= z1e;

        // Keep a zero result as-is (no exponent to restore).
        pick1 = as_int(z) & 0x7fffffff;
        pick = pick1 == 0;
        z = CM(pick, z, z1);

        // Underflow -> signed zero; overflow -> signed infinity.
        int z2 = as_int(z) & 0x80000000;
        pick = ze <= 0;
        z = CM(pick, z2, z);
        z2 |= 0x7f800000;
        pick = ze > 254;
        z = CM(pick, z2, z);

        // c dwarfs the product: result is just c.
        pick1 = ce - pe;
        pick = pick1 > 30;
        z = CM(pick, cs, z);
        z = CM(spcl, z3, z);    // special inputs: defer to mad()
        z = CM(spcl2, cs, z);   // finite a,b with inf/NaN c: return c
        return z;
    }
}
diff --git a/amd-builtins/math32/fmaxF.cl b/amd-builtins/math32/fmaxF.cl new file mode 100644 index 0000000..f8b48a6 --- /dev/null +++ b/amd-builtins/math32/fmaxF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

extern __attribute__((pure)) float __hsail_max_f32(float,float);

// fmax: maximum of two floats, delegated entirely to the HSAIL max intrinsic.
// NOTE(review): OpenCL fmax must return the non-NaN operand when exactly one
// argument is NaN — assumed here to be provided by __hsail_max_f32; confirm
// against the HSAIL specification.
__attribute__ ((overloadable, always_inline)) float
fmax(float x, float y)
{
    return __hsail_max_f32(x, y);
}
diff --git a/amd-builtins/math32/fminF.cl b/amd-builtins/math32/fminF.cl new file mode 100644 index 0000000..f78ad5b --- /dev/null +++ b/amd-builtins/math32/fminF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

extern __attribute__((pure)) float __hsail_min_f32(float,float);

// fmin: minimum of two floats, delegated entirely to the HSAIL min intrinsic.
// NOTE(review): a previous comment here claimed "The adds here force subnormal
// values to zero", but no adds remain in this body — any subnormal flushing is
// whatever __hsail_min_f32 itself does; confirm against the HSAIL specification.
__attribute__((overloadable, always_inline)) float
fmin(float x, float y)
{
    return __hsail_min_f32(x, y);
}
diff --git a/amd-builtins/math32/fmodF.cl b/amd-builtins/math32/fmodF.cl new file mode 100644 index 0000000..3fb552c --- /dev/null +++ b/amd-builtins/math32/fmodF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

// fmod shares its implementation with remainder(): remainderF.h contains the
// common body and selects fmod semantics when COMPILING_FMOD is defined
// before inclusion.
#define COMPILING_FMOD

#include "remainderF.h"
diff --git a/amd-builtins/math32/fractF.cl b/amd-builtins/math32/fractF.cl new file mode 100644 index 0000000..55dc3fc --- /dev/null +++ b/amd-builtins/math32/fractF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// fract: fractional part of x, with floor(x) stored through ip.
// The fractional part is clamped to at most 0x1.fffffep-1f (the largest float
// below 1.0) so the subtraction's rounding can never make it reach 1.0.
// +/-inf yields a 0.0f fractional part; NaN propagates to the return value.
__attribute__((overloadable, always_inline)) float
fract(float x, float *ip)
{
    float i = __amdil_round_neginf_f32(x);
    float r = x - i;
    r = __amdil_min_f32(r, 0x1.fffffep-1f);
    r = isinf(x) ? 0.0f : r;
    r = isnan(x) ? x : r;
    *ip = i;
    return r;
}

// Pre-OpenCL-2.0 builds have no generic address space, so explicit __local
// and __global pointer overloads are provided; both forward to the private
// pointer version above.
#if __OPENCL_C_VERSION__ < 200

__attribute__((overloadable, always_inline)) float
fract(float x, __local float *ip)
{
    float i;
    float f = fract(x, &i);
    *ip = i;
    return f;
}

__attribute__((overloadable, always_inline)) float
fract(float x, __global float *ip)
{
    float i;
    float f = fract(x, &i);
    *ip = i;
    return f;
}

#endif
diff --git a/amd-builtins/math32/frexpF.cl b/amd-builtins/math32/frexpF.cl new file mode 100644 index 0000000..75d41c6 --- /dev/null +++ b/amd-builtins/math32/frexpF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// frexp: decompose x into a mantissa in [0.5, 1) and an integer exponent.
// Note: the scalar paths combine comparison results with bitwise &/| (each
// comparison yields 0 or 1), avoiding short-circuit branches.
__attribute__((overloadable, always_inline, weak)) float
frexp(float x, int *ep)
{
    int i = as_int(x);
    int ai = i & 0x7fffffff;
    // d: x is subnormal (non-zero with biased exponent 0).
    int d = ai > 0 & ai < 0x00800000;
    // scale subnormal by 2^26 without multiplying
    float s = as_float(ai | 0x0d800000) - 0x1.0p-100F;
    ai = d ? as_int(s) : ai;
    // -126 rebias puts the mantissa in [0.5, 1); undo the 2^26 scaling for
    // the subnormal path.
    int e = (ai >> 23) - 126 - (d ? 26 : 0);
    // t: x is zero (ai == 0) or inf/NaN (e == 129 <=> biased exponent 255);
    // those return x unchanged with *ep = 0.
    int t = ai == 0 | e == 129;
    // Repack: original sign, fixed exponent 126 (0x3f000000), same mantissa.
    i = (i & 0x80000000) | 0x3f000000 | (ai & 0x007fffff);
    *ep = t ? 0 : e;
    return t ? x : as_float(i);
}

// Pre-OpenCL-2.0: explicit __local/__global exponent-pointer overloads.
#if __OPENCL_C_VERSION__ < 200

__attribute__((overloadable, always_inline, weak)) float
frexp(float x, __local int *ep)
{
    int e;
    float f = frexp(x, &e);
    *ep = e;
    return f;
}

__attribute__((overloadable, always_inline, weak)) float
frexp(float x, __global int *ep)
{
    int e;
    float f = frexp(x, &e);
    *ep = e;
    return f;
}
#endif

// Vector variant of the same algorithm.  Vector comparisons produce
// all-ones/all-zero lane masks, so selections use bitselect instead of ?:.
__attribute__((overloadable, always_inline, weak)) float4
frexp(float4 x, int4 *ep)
{
    int4 i = as_int4(x);
    int4 ai = i & 0x7fffffff;
    int4 d = ai > 0 & ai < 0x00800000;
    float4 s = as_float4(ai | 0x0d800000) - 0x1.0p-100F;
    ai = bitselect(ai, as_int4(s), d);
    int4 e = (ai >> 23) - 126 - bitselect((int4)0, (int4)26, d);
    int4 t = ai == (int4)0 | e == (int4)129;
    i = (i & (int4)0x80000000) | (int4)0x3f000000 | (ai & 0x007fffff);
    *ep = bitselect(e, (int4)0, t);
    return bitselect(as_float4(i), x, as_float4(t));
}


#if __OPENCL_C_VERSION__ < 200
__attribute__((overloadable, always_inline, weak)) float4
frexp(float4 x, __global int4 *ep)
{
    int4 e;
    float4 ret = frexp(x, &e);
    *ep = e;
    return ret;
}

__attribute__((overloadable, always_inline, weak)) float4
frexp(float4 x, __local int4 *ep)
{
    int4 e;
    float4 ret = frexp(x, &e);
    *ep = e;
    return ret;
}
#endif
diff --git a/amd-builtins/math32/half_cosF.cl b/amd-builtins/math32/half_cosF.cl new file mode 100644 index 0000000..d8c7b9c --- /dev/null +++ b/amd-builtins/math32/half_cosF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"
#include "remainderF_piby2.h"
#include "sincosF_piby4.h"

// half_cos: reduced-precision cosine.
// Works on |x| (cos is even), reduces the argument to +/-pi/4 via
// argReductionSmallS (regn selects the quadrant), then evaluates the
// appropriate sin/cos polynomial and fixes the sign from the quadrant.
__attribute__((overloadable, always_inline, weak)) float
half_cos(float x)
{
    int ix = as_int(x);
    int ax = ix & 0x7fffffff;
    float dx = as_float(ax);

    float r0, r1;
    int regn = argReductionSmallS(&r0, &r1, dx);

    float ss = -sinf_piby4_new(r0, r1);
    float cc = cosf_piby4_new(r0, r1);
    // Odd quadrants use the (negated) sine polynomial; quadrants 2,3 flip sign.
    float c = (regn & 1) != 0 ? ss : cc;
    c = as_float(as_int(c) ^ ((regn > 1) <<31));

    // 0x47800000 == 65536.0f == 2^16: beyond the half_* domain, return 1.0f.
    c = ax > 0x47800000 ? 1.0f : c;
    // inf/NaN -> quiet NaN.
    c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c;
    return c;
}
diff --git a/amd-builtins/math32/half_divideF.cl b/amd-builtins/math32/half_divideF.cl new file mode 100644 index 0000000..b3e3d8b --- /dev/null +++ b/amd-builtins/math32/half_divideF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// half_divide: reduced-precision x/y via the native divide.
// When |y| > 2^96 both y and the quotient are pre/post-scaled by 2^-32:
// s * (x / (y * s)) == x/y algebraically, but keeps the intermediate
// divisor in a range the native divide handles.
__attribute__((overloadable, always_inline, weak)) float
half_divide(float x, float y)
{
    int c = fabs(y) > 0x1.0p+96f;
    float s = c ? 0x1.0p-32f : 1.0f;
    y *= s;
    return s * native_divide(x, y);
}

//__attribute__((overloadable, always_inline)) float2
//half_divide(float2 x, float2 y)
//{
//    int2 c = fabs(y) > 0x1.0p+96f;
//    float2 s = c ? 0x1.0p-32f : 1.0f;
//    y *= s;
//    return s * native_divide(x, y);
//}
//
//__attribute__((overloadable, always_inline)) float4
//half_divide(float4 x, float4 y)
//{
//    int4 c = fabs(y) > 0x1.0p+96f;
//    float4 s = c ? 0x1.0p-32f : 1.0f;
//    y *= s;
//    return s * native_divide(x, y);
//}
diff --git a/amd-builtins/math32/half_expF.cl b/amd-builtins/math32/half_expF.cl new file mode 100644 index 0000000..bd21b76 --- /dev/null +++ b/amd-builtins/math32/half_expF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// The half_exp family maps directly onto the native (fast, reduced-precision)
// exponential builtins; half_* precision requirements are satisfied by the
// native implementations on this target.

// half_exp: base-e exponential.
__attribute__((overloadable, always_inline, weak)) float
half_exp(float x)
{
    return native_exp(x);
}

//__attribute__((overloadable, always_inline)) float2
//half_exp(float2 x)
//{
//    return native_exp(x);
//}
//
//__attribute__((overloadable, always_inline)) float4
//half_exp(float4 x)
//{
//    return native_exp(x);
//}

// half_exp2: base-2 exponential.
__attribute__((overloadable, always_inline, weak)) float
half_exp2(float x)
{
    return native_exp2(x);
}

//__attribute__((overloadable, always_inline)) float2
//half_exp2(float2 x)
//{
//    return native_exp2(x);
//}
//
//__attribute__((overloadable, always_inline)) float4
//half_exp2(float4 x)
//{
//    return native_exp2(x);
//}

// half_exp10: base-10 exponential.
__attribute__((overloadable, always_inline, weak)) float
half_exp10(float x)
{
    return native_exp10(x);
}

//__attribute__((overloadable, always_inline)) float2
//half_exp10(float2 x)
//{
//    return native_exp10(x);
//}
//
//__attribute__((overloadable, always_inline)) float4
//half_exp10(float4 x)
//{
//    return native_exp10(x);
//}
diff --git a/amd-builtins/math32/half_logF.cl b/amd-builtins/math32/half_logF.cl new file mode 100644 index 0000000..fad736a --- /dev/null +++ b/amd-builtins/math32/half_logF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// The half_log family maps directly onto the native (fast, reduced-precision)
// logarithm builtins.

// half_log2: base-2 logarithm.
__attribute__((overloadable, always_inline, weak)) float
half_log2(float x)
{
    return native_log2(x);
}

// half_log10: base-10 logarithm.
__attribute__((overloadable, always_inline, weak)) float
half_log10(float x)
{
    return native_log10(x);
}

// half_log: natural logarithm.
__attribute__((overloadable, always_inline, weak)) float
half_log(float x)
{
    return native_log(x);
}
diff --git a/amd-builtins/math32/half_powrF.cl b/amd-builtins/math32/half_powrF.cl new file mode 100644 index 0000000..978cdeb --- /dev/null +++ b/amd-builtins/math32/half_powrF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// half_powr: x^y for x >= 0, delegated to the full-precision powr
// (no reduced-precision native path is used here).
__attribute__((overloadable, always_inline)) float
half_powr(float x, float y)
{
    return powr(x, y);
}

//__attribute__((overloadable, always_inline)) float4
//half_powr(float4 x, float4 y)
//{
//    return powr(x, y);
//}
diff --git a/amd-builtins/math32/half_recipF.cl b/amd-builtins/math32/half_recipF.cl new file mode 100644 index 0000000..43af15f --- /dev/null +++ b/amd-builtins/math32/half_recipF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

extern __attribute__((pure)) float __amdil_div_f32(float,float);
//extern __attribute__((pure)) float2 __amdil_div_v2f32(float2,float2);
//extern __attribute__((pure)) float4 __amdil_div_v4f32(float4,float4);

// half_recip: reduced-precision reciprocal, computed as 1/x via the AMDIL
// divide intrinsic.
__attribute__((overloadable, always_inline)) float
half_recip(float x)
{
    return __amdil_div_f32(1.0f, x);
}

//__attribute__((overloadable, always_inline)) float2
//half_recip(float2 x)
//{
//    return __amdil_div_v2f32((float2)1.0f, x);
//}
//
//__attribute__((overloadable, always_inline)) float3
//half_recip(float3 x)
//{
//    float3 ret;
//    ret.s01 = __amdil_div_v2f32((float2)1.0f, x.s01);
//    ret.s2 = __amdil_div_f32(1.0f, x.s2);
//    return ret;
//}
//
//__attribute__((overloadable, always_inline)) float4
//half_recip(float4 x)
//{
//    return __amdil_div_v4f32((float4)1.0f, x);
//}
diff --git a/amd-builtins/math32/half_rsqrtF.cl b/amd-builtins/math32/half_rsqrtF.cl new file mode 100644 index 0000000..8268f72 --- /dev/null +++ b/amd-builtins/math32/half_rsqrtF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// half_rsqrt: reduced-precision 1/sqrt(x), mapped to the native builtin.
__attribute__((overloadable, always_inline, weak)) float
half_rsqrt(float x)
{
    return native_rsqrt(x);
}
diff --git a/amd-builtins/math32/half_sinF.cl b/amd-builtins/math32/half_sinF.cl new file mode 100644 index 0000000..b1c7201 --- /dev/null +++ b/amd-builtins/math32/half_sinF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"
#include "remainderF_piby2.h"
#include "sincosF_piby4.h"

// half_sin: reduced-precision sine.
// Computes sin(|x|) via quadrant reduction and restores the argument's sign
// at the end with the sign bit (ix ^ ax); +/-0 is passed through to preserve
// signed zero.
// NOTE(review): unlike half_cos, this declaration lacks always_inline —
// possibly an oversight; confirm the intent before changing.
__attribute__((overloadable, weak)) float
half_sin(float x)
{
    int ix = as_int(x);
    int ax = ix & 0x7fffffff;

    float dx = as_float(ax);

    float r0, r1;
    int regn = argReductionSmallS(&r0, &r1, dx);

    float ss = sinf_piby4_new(r0, r1);
    float cc = cosf_piby4_new(r0, r1);
    // Odd quadrants use the cosine polynomial; quadrants 2,3 flip sign.
    float s = (regn & 1) != 0 ? cc : ss;
    s = as_float(as_int(s) ^ ((regn > 1) << 31));

    // 0x47800000 == 65536.0f == 2^16: beyond the half_* domain, use 1.0f
    // (sign of x is applied on the next line).
    s = ax > 0x47800000 ? 1.0f : s;
    s = as_float(as_int(s) ^ (ix ^ ax));
    s = x == 0.0f ? x : s;
    // inf/NaN -> quiet NaN.
    s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s;
    return s;
}
diff --git a/amd-builtins/math32/half_sincos.cl b/amd-builtins/math32/half_sincos.cl new file mode 100644 index 0000000..31f63e4 --- /dev/null +++ b/amd-builtins/math32/half_sincos.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"
#include "remainderF_piby2.h"
#include "sincosF_piby4.h"

// Calculate half_sin and half_cos at once

// Returns sin(x); stores cos(x) through cp.  Shares one argument reduction
// between both results; sign handling mirrors half_sin (sign of x re-applied
// to the sine only, +/-0 passed through) and half_cos.
__attribute__((always_inline, weak)) float
__hsa_half_sincos(float x, float *cp)
{
    int ix = as_int(x);
    int ax = ix & 0x7fffffff;
    float dx = as_float(ax);

    float r0, r1;
    int regn = argReductionSmallS(&r0, &r1, dx);

    float ss = sinf_piby4_new(r0, r1);
    float cc = cosf_piby4_new(r0, r1);
    // Odd quadrants swap the sin/cos polynomials (with the sine negated for cos).
    bool reg0 = (regn & 1) != 0;
    float c = reg0 ? -ss : cc;
    float s = reg0 ? cc : ss;
    // Quadrants 2,3 flip the sign of both results.
    int xsign = ((regn > 1) << 31);
    c = as_float(as_int(c) ^ xsign);
    s = as_float(as_int(s) ^ xsign);

    // 0x47800000 == 65536.0f == 2^16: beyond the half_* domain both become 1.0f
    // (the sine then gets x's sign re-applied below).
    bool is_huge = ax > 0x47800000;
    c = is_huge ? 1.0f : c;
    s = is_huge ? 1.0f : s;
    s = as_float(as_int(s) ^ (ix ^ ax));
    s = x == 0.0f ? x : s;
    // inf/NaN -> quiet NaN for both outputs.
    bool is_inf = ax >= PINFBITPATT_SP32;
    c = is_inf ? as_float(QNANBITPATT_SP32) : c;
    s = is_inf ? as_float(QNANBITPATT_SP32) : s;
    *cp = c;
    return s;
}
diff --git a/amd-builtins/math32/half_sqrtF.cl b/amd-builtins/math32/half_sqrtF.cl new file mode 100644 index 0000000..4e3f937 --- /dev/null +++ b/amd-builtins/math32/half_sqrtF.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// half_sqrt: reduced-precision square root, mapped to the native builtin.
__attribute__((overloadable, always_inline, weak)) float
half_sqrt(float x)
{
    return native_sqrt(x);
}
diff --git a/amd-builtins/math32/half_tanF.cl b/amd-builtins/math32/half_tanF.cl new file mode 100644 index 0000000..1b20ba6 --- /dev/null +++ b/amd-builtins/math32/half_tanF.cl
@@ -0,0 +1,44 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" +#include "remainderF_piby2.h" +#include "tanF_piby4.h" + +__attribute__((overloadable)) float +half_tan(float x) +{ + int ix = as_int(x); + int ax = ix & 0x7fffffff; + float dx = as_float(ax); + + float r0, r1; + int regn = argReductionSmallS(&r0, &r1, dx); + + float t = tanf_piby4_new(r0, regn); + t = ix != ax ? -t : t; + t = x == 0.0f ? x : t; + t = ax > 0x47800000 ? 0.0f : t; + t = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : t; + return t; +} +
diff --git a/amd-builtins/math32/hypotF.cl b/amd-builtins/math32/hypotF.cl new file mode 100644 index 0000000..36f2931 --- /dev/null +++ b/amd-builtins/math32/hypotF.cl
#include "math32.h"
#if !defined(SUBNORMALS_SUPPORTED)
#include "floattointconversion.h"
#endif //SUBNORMALS_SUPPORTED

// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result warrants it

__attribute__((overloadable, always_inline)) float
hypot(float x, float y)
{
    uint ux = as_uint(x);
    uint aux = ux & EXSIGNBIT_SP32;    // |x| bit pattern
    uint uy = as_uint(y);
    uint auy = uy & EXSIGNBIT_SP32;    // |y| bit pattern
    float retval;
    // Order so that ux holds the larger magnitude, uy the smaller.
    int c = aux > auy;
    ux = c ? aux : auy;
    uy = c ? auy : aux;

#if !defined(SUBNORMALS_SUPPORTED)
    // Fast path only when the smaller operand is nonzero (i.e. not a
    // flushed subnormal) and the larger is comfortably below overflow.
    if( as_float(uy) > 0.0 && ux < 0x7effffff)
    {
#endif //SUBNORMALS_SUPPORTED
    // Scale both operands by 2^-xexp so the squares cannot overflow or
    // underflow, take the root, then scale back by 2^xexp.
    int xexp = clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126);
    float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);   // 2^xexp
    float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);  // 2^-xexp
    float fx = as_float(ux) * fi_exp;
    float fy = as_float(uy) * fi_exp;
    retval = MATH_SQRT(mad(fx, fx, fy*fy)) * fx_exp;
#if !defined(SUBNORMALS_SUPPORTED)
    }
    else
    {
        // Subnormal or near-overflow path: compute in double via the
        // software float<->double converters to avoid flushing.
        double dy = float_uint_to_double(as_uint(y));
        double dx = float_uint_to_double(as_uint(x));
        double dretval = sqrt(dx*dx + dy*dy);
        retval = as_float(double_to_float_uint(dretval));
    }
#endif //SUBNORMALS_SUPPORTED

    // If the larger magnitude is NaN, or the smaller is zero, the answer is
    // the larger magnitude itself.  (| is bitwise-or of 0/1 comparisons.)
    retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
    // hypot(+-Inf, anything) = +Inf, even if the other argument is NaN.
    retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32 ? as_float(PINFBITPATT_SP32) : retval;
    return retval;
}
diff --git a/amd-builtins/math32/ilogbF.cl b/amd-builtins/math32/ilogbF.cl new file mode 100644 index 0000000..e2a737c --- /dev/null +++ b/amd-builtins/math32/ilogbF.cl
@@ -0,0 +1,37 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline)) int +ilogb(float x) +{ + uint ux = as_uint(x); + uint ax = ux & EXSIGNBIT_SP32; + int rs = -118 - (int)clz(ux & MANTBITS_SP32); + int r = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + r = ax < 0x00800000U ? rs : r; + r = ax > EXPBITS_SP32 | ax == 0 ? 0x80000000 : r; + r = ax == EXPBITS_SP32 ? 0x7fffffff : r; + return r; +} +
diff --git a/amd-builtins/math32/ldexpF.cl b/amd-builtins/math32/ldexpF.cl new file mode 100644 index 0000000..f6c3492 --- /dev/null +++ b/amd-builtins/math32/ldexpF.cl
@@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline, weak)) float +ldexp(float x, int n) +{ + #if 0 + // This treats subnormals as zeros + int i = as_int(x); + int e = (i >> 23) & 0xff; + int m = i & 0x007fffff; + int s = i & 0x80000000; + int v = add_sat(e, n); + v = clamp(v, 0, 0xff); + int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; + int c = e == 0xff; + mr = c ? m : mr; + int er = c ? e : v; + er = e ? 
er : e; + return as_float( s | (er << 23) | mr ); + #endif + + + /* supports denormal values */ + const int multiplier = 24; + float val_f; + uint val_ui; + uint sign; + int exponent; + val_ui = as_uint(x); + sign = val_ui & 0x80000000; + val_ui = val_ui & 0x7fffffff;/* remove the sign bit */ + int val_x = val_ui; + + exponent = val_ui >> 23; /* get the exponent */ + int dexp = exponent; + + + /* denormal support */ + int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0)) >> 23); + int dexponent = 25 - fbh; + uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23)); + int ex = dexponent + n - multiplier; + dexponent = ex; + uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); + int ex1 = dexponent + multiplier; + ex1 = -ex1 +25; + dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1); + dval_ui = dexponent > 0 ? val :dval_ui; + dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/ + dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ + dval_ui = dval_ui | sign; + val_f = as_float(dval_ui); + + exponent += n; + + val = sign | (exponent << 23) | (val_ui & 0x007fffff); + ex1 = exponent + multiplier; + ex1 = -ex1 +25; + val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1); + val_ui = exponent > 0 ? val :val_ui; + val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/ + val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ + val_ui = val_ui | sign; + + val_ui = dexp == 0? dval_ui : val_ui; + val_f = as_float(val_ui); + + + val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; + return val_f; + + +}
diff --git a/amd-builtins/math32/lgammaF.cl b/amd-builtins/math32/lgammaF.cl new file mode 100644 index 0000000..b7ecd4a --- /dev/null +++ b/amd-builtins/math32/lgammaF.cl
@@ -0,0 +1,255 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
#define pi 3.1415927410e+00f        /* 0x40490fdb */

#define a0 7.7215664089e-02f        /* 0x3d9e233f */
#define a1 3.2246702909e-01f        /* 0x3ea51a66 */
#define a2 6.7352302372e-02f        /* 0x3d89f001 */
#define a3 2.0580807701e-02f        /* 0x3ca89915 */
#define a4 7.3855509982e-03f        /* 0x3bf2027e */
#define a5 2.8905137442e-03f        /* 0x3b3d6ec6 */
#define a6 1.1927076848e-03f        /* 0x3a9c54a1 */
#define a7 5.1006977446e-04f        /* 0x3a05b634 */
#define a8 2.2086278477e-04f        /* 0x39679767 */
#define a9 1.0801156895e-04f        /* 0x38e28445 */
#define a10 2.5214456400e-05f       /* 0x37d383a2 */
#define a11 4.4864096708e-05f       /* 0x383c2c75 */

#define tc 1.4616321325e+00f        /* 0x3fbb16c3 */

#define tf -1.2148628384e-01f       /* 0xbdf8cdcd */
/* tt -(tail of tf) */
#define tt 6.6971006518e-09f        /* 0x31e61c52 */

#define t0 4.8383611441e-01f        /* 0x3ef7b95e */
#define t1 -1.4758771658e-01f       /* 0xbe17213c */
#define t2 6.4624942839e-02f        /* 0x3d845a15 */
#define t3 -3.2788541168e-02f       /* 0xbd064d47 */
#define t4 1.7970675603e-02f        /* 0x3c93373d */
#define t5 -1.0314224288e-02f       /* 0xbc28fcfe */
#define t6 6.1005386524e-03f        /* 0x3bc7e707 */
#define t7 -3.6845202558e-03f       /* 0xbb7177fe */
#define t8 2.2596477065e-03f        /* 0x3b141699 */
#define t9 -1.4034647029e-03f       /* 0xbab7f476 */
#define t10 8.8108185446e-04f       /* 0x3a66f867 */
#define t11 -5.3859531181e-04f      /* 0xba0d3085 */
#define t12 3.1563205994e-04f       /* 0x39a57b6b */
#define t13 -3.1275415677e-04f      /* 0xb9a3f927 */
#define t14 3.3552918467e-04f       /* 0x39afe9f7 */

#define u0 -7.7215664089e-02f       /* 0xbd9e233f */
#define u1 6.3282704353e-01f        /* 0x3f2200f4 */
#define u2 1.4549225569e+00f        /* 0x3fba3ae7 */
#define u3 9.7771751881e-01f        /* 0x3f7a4bb2 */
#define u4 2.2896373272e-01f        /* 0x3e6a7578 */
#define u5 1.3381091878e-02f        /* 0x3c5b3c5e */

#define v1 2.4559779167e+00f        /* 0x401d2ebe */
#define v2 2.1284897327e+00f        /* 0x4008392d */
#define v3 7.6928514242e-01f        /* 0x3f44efdf */
#define v4 1.0422264785e-01f        /* 0x3dd572af */
#define v5 3.2170924824e-03f        /* 0x3b52d5db */

#define s0 -7.7215664089e-02f       /* 0xbd9e233f */
#define s1 2.1498242021e-01f        /* 0x3e5c245a */
#define s2 3.2577878237e-01f        /* 0x3ea6cc7a */
#define s3 1.4635047317e-01f        /* 0x3e15dce6 */
#define s4 2.6642270386e-02f        /* 0x3cda40e4 */
#define s5 1.8402845599e-03f        /* 0x3af135b4 */
#define s6 3.1947532989e-05f        /* 0x3805ff67 */

#define r1 1.3920053244e+00f        /* 0x3fb22d3b */
#define r2 7.2193557024e-01f        /* 0x3f38d0c5 */
#define r3 1.7193385959e-01f        /* 0x3e300f6e */
#define r4 1.8645919859e-02f        /* 0x3c98bf54 */
#define r5 7.7794247773e-04f        /* 0x3a4beed6 */
#define r6 7.3266842264e-06f        /* 0x36f5d7bd */

#define w0 4.1893854737e-01f        /* 0x3ed67f1d */
#define w1 8.3333335817e-02f        /* 0x3daaaaab */
#define w2 -2.7777778450e-03f       /* 0xbb360b61 */
#define w3 7.9365057172e-04f        /* 0x3a500cfd */
#define w4 -5.9518753551e-04f       /* 0xba1c065c */
#define w5 8.3633989561e-04f        /* 0x3a5b3dd2 */
#define w6 -1.6309292987e-03f       /* 0xbad5c4e8 */

// lgamma_r(x, signp): log(|gamma(x)|), with the sign of gamma(x) stored
// through *signp (+1 or -1).  fdlibm-derived: the domain is split into a
// small-|x| region, [a pieced interval near the minimum tc ~ 1.46], [2, 8),
// [8, 2^58), and a log-dominated asymptotic tail; negative x uses the
// reflection formula via sinpi.
__attribute__ ((overloadable, always_inline)) float
lgamma_r(float x, int *signp)
{
    int hx = as_int(x);
    int ix = hx & 0x7fffffff;
    float absx = as_float(ix);

    // +-Inf and NaN pass through with sign +1.
    if (ix >= 0x7f800000) {
        *signp = 1;
        return x;
    }

    // |x| < 2^-70: gamma(x) ~ 1/x, so lgamma ~ -log|x|.
    if (absx < 0x1.0p-70f) {
        *signp = hx < 0 ? -1 : 1;
        return -log(absx);
    }

    float r;

    // Exact zeros of lgamma.  (| is intentional bitwise-or of 0/1 values.)
    if (absx == 1.0f | absx == 2.0f)
        r = 0.0f;

    else if (absx < 2.0f) {
        // Select the approximation interval branchlessly: y is the local
        // variable of the polynomial, i picks which polynomial (0, 1 or 2).
        float y = 2.0f - absx;
        int i = 0;

        int c = absx < 0x1.bb4c30p+0f;
        float yt = absx - tc;
        y = c ? yt : y;
        i = c ? 1 : i;

        c = absx < 0x1.3b4c40p+0f;
        yt = absx - 1.0f;
        y = c ? yt : y;
        i = c ? 2 : i;

        r = -log(absx);
        yt = 1.0f - absx;
        c = absx <= 0x1.ccccccp-1f;
        r = c ? r : 0.0f;
        y = c ? yt : y;
        i = c ? 0 : i;

        c = absx < 0x1.769440p-1f;
        yt = absx - (tc - 1.0f);
        y = c ? yt : y;
        i = c ? 1 : i;

        c = absx < 0x1.da6610p-3f;
        y = c ? absx : y;
        i = c ? 2 : i;

        float z, w, p1, p2, p3, p;
        switch (i) {
        case 0:
            // Series around y = 0 using the a* coefficients.
            z = y * y;
            p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
            p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
            p = mad(y, p1, p2);
            r += mad(y, -0.5f, p);
            break;
        case 1:
            // Expansion around the minimum tc using the t* coefficients;
            // tf/tt are the head/tail of lgamma(tc).
            z = y * y;
            w = z * y;
            p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
            p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
            p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
            p = mad(z, p1, -mad(w, -mad(y, p3, p2), tt));
            r += tf + p;
            break;
        case 2:
            // Rational approximation u*/v* around y = 0.
            p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
            p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), 1.0f);
            r += mad(y, -0.5f, MATH_DIVIDE(p1, p2));
            break;
        }
    } else if (absx < 8.0f) {
        // [2, 8): rational approximation on the fractional part, then add
        // log of the product (y+2)(y+3)...(y+i-1) to step down from i.
        int i = (int)absx;
        float y = absx - (float) i;
        float p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
        float q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), 1.0f);
        r = mad(y, 0.5f, MATH_DIVIDE(p, q));

        float y6 = y + 6.0f;
        float y5 = y + 5.0f;
        float y4 = y + 4.0f;
        float y3 = y + 3.0f;
        float y2 = y + 2.0f;

        float z = 1.0f;
        z *= i > 6 ? y6 : 1.0f;
        z *= i > 5 ? y5 : 1.0f;
        z *= i > 4 ? y4 : 1.0f;
        z *= i > 3 ? y3 : 1.0f;
        z *= i > 2 ? y2 : 1.0f;

        r += log(z);
    } else if (absx < 0x1.0p+58f) {
        // [8, 2^58): Stirling-type asymptotic series with w* correction.
        float z = 1.0f / absx;
        float y = z * z;
        float w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
        r = mad(absx - 0.5f, log(absx) - 1.0f, w);
    } else
        // 2**58 <= x <= Inf
        r = absx * (log(absx) - 1.0f);

    int s = 1;

    // Negative x: reflection formula lgamma(x) = log(pi/|x sin(pi x)|) - lgamma(|x|).
    if (x < 0.0f) {
        float t = sinpi(x);
        r = log(pi / fabs(t * x)) - r;
        r = t == 0.0f ? as_float(PINFBITPATT_SP32) : r;   // pole at negative integers
        s = t < 0.0f ? -1 : s;
    }

    *signp = s;
    return r;
}

#if __OPENCL_C_VERSION__ < 200
// Address-space overloads for pre-2.0 OpenCL (no generic address space):
// delegate to the private-pointer version and copy the sign out.
__attribute__ ((overloadable, always_inline)) float
lgamma_r(float x, __local int *signp)
{
    int s;
    float l = lgamma_r(x, &s);
    *signp = s;
    return l;
}

__attribute__ ((overloadable, always_inline)) float
lgamma_r(float x, __global int *signp)
{
    int s;
    float l = lgamma_r(x, &s);
    *signp = s;
    return l;
}
#endif

// lgamma(x): lgamma_r with the sign discarded.
__attribute__ ((overloadable, always_inline)) float
lgamma(float x)
{
    int s;
    float l = lgamma_r(x, &s);
    return l;
}
diff --git a/amd-builtins/math32/log10F.cl b/amd-builtins/math32/log10F.cl new file mode 100644 index 0000000..b418fea --- /dev/null +++ b/amd-builtins/math32/log10F.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_LOG10 +#include "logF_base.h" +
diff --git a/amd-builtins/math32/log1pF.cl b/amd-builtins/math32/log1pF.cl new file mode 100644 index 0000000..eb2b059 --- /dev/null +++ b/amd-builtins/math32/log1pF.cl
@@ -0,0 +1,91 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
#include "math32.h"

// log1p(x) = log(1 + x), accurate near x = 0 where computing 1+x directly
// would lose low-order bits.  Table-driven (same LOGE_TBL / LOG_INV_TBL
// scheme as logF_base.h), with a dedicated series for |x| < 2^-4.
__attribute__((overloadable)) float
log1p(float x)
{
    USE_TABLE(float2, p_log, LOGE_TBL);
    USE_TABLE(float, p_inv, LOG_INV_TBL);

    float w = x;                       // keep the original argument
    uint ux = as_uint(x);
    uint ax = ux & EXSIGNBIT_SP32;

    // |x| < 2^-4: series in u = 2x/(2+x), which equals log((1+s)/(1-s))
    // with s = x/(2+x).
    float u2 = MATH_DIVIDE(x, 2.0f + x);
    float u = u2 + u2;
    float v = u * u;
    // 2/(5 * 2^5), 2/(3 * 2^3)
    float zsmall = mad(-u2, x, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v * u) + x;

    // |x| >= 2^-4: operate on 1 + x.
    //x = x + 1.0f;
    ux = as_uint(x + 1.0f);

    int m = (int)((ux >> EXPSHIFTBITS_SP32) & 0xff) - EXPBIAS_SP32;
    float mf = (float)m;
    // Table index from the top mantissa bits, rounded at bit 15.
    uint indx = (ux & 0x007f0000) + ((ux & 0x00008000) << 1);
    float F = as_float(indx | 0x3f000000);

    // x > 2^24: 1+x == x to float precision, no correction term needed.
    float fg24 = F - as_float(0x3f000000 | (ux & MANTBITS_SP32));

    // x <= 2^24: recover the bits of x lost when forming 1+x (xt is the
    // rounding error of the addition, rescaled into the reduced frame).
    uint xhi = ux & 0xffff8000;
    float xh = as_float(xhi);
    float xt = (1.0f - xh) + w;
    uint xnm = ((~(xhi & 0x7f800000)) - 0x00800000) & 0x7f800000;
    xt = xt * as_float(xnm) * 0.5f;
    float fl24 = F - as_float(0x3f000000 | (xhi & MANTBITS_SP32)) - xt;

    float f = mf > 24.0f ? fg24 : fl24;

    indx = indx >> 16;
    float r = f * p_inv[indx];

    // 1/3, 1/2
    float poly = mad(mad(r, 0x1.555556p-2f, 0x1.0p-1f), r*r, r);

    const float LOG2_HEAD = 0x1.62e000p-1f;   // 0.693115234
    const float LOG2_TAIL = 0x1.0bfbe8p-15f;  // 0.0000319461833

    // log(1+x) = m*log2 + log(F) - poly, assembled in head/tail pieces.
    float2 tv = p_log[indx];
    float z1 = mad(mf, LOG2_HEAD, tv.s0);
    float z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
    float z = z1 + z2;

    z = ax < 0x3d800000U ? zsmall : z;

    // Edge cases
    z = ax >= PINFBITPATT_SP32 ? w : z;                   // +Inf / NaN pass through
    z = w < -1.0f ? as_float(QNANBITPATT_SP32) : z;       // log of a negative number
    z = w == -1.0f ? as_float(NINFBITPATT_SP32) : z;      // log(0) = -Inf
    //fix subnormals
    z = ax < 0x33800000 ? x : z;                          // tiny x: log1p(x) ~ x

    return z;
}
diff --git a/amd-builtins/math32/log2F.cl b/amd-builtins/math32/log2F.cl new file mode 100644 index 0000000..a90e149 --- /dev/null +++ b/amd-builtins/math32/log2F.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_LOG2 +#include "logF_base.h" +
diff --git a/amd-builtins/math32/logF.cl b/amd-builtins/math32/logF.cl new file mode 100644 index 0000000..79fb03e --- /dev/null +++ b/amd-builtins/math32/logF.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_LOG +#include "logF_base.h" +
diff --git a/amd-builtins/math32/logF_base.h b/amd-builtins/math32/logF_base.h new file mode 100644 index 0000000..9482247 --- /dev/null +++ b/amd-builtins/math32/logF_base.h
@@ -0,0 +1,181 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math32.h" + +/* + Algorithm: + + Based on: + Ping-Tak Peter Tang + "Table-driven implementation of the logarithm function in IEEE + floating-point arithmetic" + ACM Transactions on Mathematical Software (TOMS) + Volume 16, Issue 4 (December 1990) + + + x very close to 1.0 is handled differently, for x everywhere else + a brief explanation is given below + + x = (2^m)*A + x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8)) + x = (2^m)*2*(G/2+g/2) + x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9)) + + Y = (2^(-1))*(2^(-m))*(2^m)*A + Now, range of Y is: 0.5 <= Y < 1 + + F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit) + Now, range of F is: 128 <= F <= 256 + F = F / 256 + Now, range of F is: 0.5 <= F <= 1 + + f = -(Y-F), with (f <= 2^(-9)) + + log(x) = m*log(2) + log(2) + log(F-f) + log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F)) + log(x) = m*log(2) + log(2*F) + log(1-r) + + r = (f/F), with (r <= 2^(-8)) + r = f*(1/F) with (1/F) precomputed to avoid division + + log(x) = m*log(2) + log(G) - poly + + log(G) is precomputed + poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5)) + + log(2) and log(G) need to be maintained in extra precision + to avoid losing precision in the calculations + + + For x close to 1.0, we employ the following technique to + ensure faster convergence. 
// Single implementation compiled three times: the including .cl file defines
// COMPILING_LOG2 / COMPILING_LOG10 / neither to select log2, log10 or log.
// See the algorithm description above (Tang's table-driven logarithm).
__attribute__((overloadable, weak)) float
#if defined(COMPILING_LOG2)
log2(float x)
#elif defined(COMPILING_LOG10)
log10(float x)
#else
log(float x)
#endif
{
    USE_TABLE(float, p_inv, LOG_INV_TBL);

#if defined(COMPILING_LOG2)
    USE_TABLE(float2, p_log, LOG2_TBL);
    const float LOG2E = 0x1.715476p+0f;       // 1.4426950408889634
    const float LOG2E_HEAD = 0x1.700000p+0f;  // 1.4375
    const float LOG2E_TAIL = 0x1.547652p-8f;  // 0.00519504072
#elif defined(COMPILING_LOG10)
    USE_TABLE(float2, p_log, LOG10_TBL);
    const float LOG10E = 0x1.bcb7b2p-2f;        // 0.43429448190325182
    const float LOG10E_HEAD = 0x1.bc0000p-2f;   // 0.43359375
    const float LOG10E_TAIL = 0x1.6f62a4p-11f;  // 0.0007007319
    const float LOG10_2_HEAD = 0x1.340000p-2f;  // 0.30078125
    const float LOG10_2_TAIL = 0x1.04d426p-12f; // 0.000248745637
#else
    USE_TABLE(float2, p_log, LOGE_TBL);
    const float LOG2_HEAD = 0x1.62e000p-1f;   // 0.693115234
    const float LOG2_TAIL = 0x1.0bfbe8p-15f;  // 0.0000319461833
#endif

    uint xi = as_uint(x);
    uint ax = xi & EXSIGNBIT_SP32;

    // Calculations for |x-1| < 2^-4: series in u = 2r/(2+r), r = x-1,
    // with a head/tail split (z1/z2) to keep extra precision.
    float r = x - 1.0f;
    int near1 = fabs(r) < 0x1.0p-4f;
    float u2 = MATH_DIVIDE(r, 2.0f + r);
    float corr = u2 * r;
    float u = u2 + u2;
    float v = u * u;
    float znear1, z1, z2;

    // 2/(5 * 2^5), 2/(3 * 2^3)
    z2 = mad(u, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f)*v, -corr);

#if defined(COMPILING_LOG2)
    // Rescale the natural-log result by log2(e), head/tail multiplied.
    z1 = as_float(as_int(r) & 0xffff0000);
    z2 = z2 + (r - z1);
    znear1 = mad(z1, LOG2E_HEAD, mad(z2, LOG2E_HEAD, mad(z1, LOG2E_TAIL, z2*LOG2E_TAIL)));
#elif defined(COMPILING_LOG10)
    // Rescale by log10(e), head/tail multiplied.
    z1 = as_float(as_int(r) & 0xffff0000);
    z2 = z2 + (r - z1);
    znear1 = mad(z1, LOG10E_HEAD, mad(z2, LOG10E_HEAD, mad(z1, LOG10E_TAIL, z2*LOG10E_TAIL)));
#else
    znear1 = z2 + r;
#endif

    // Calculations for x not near 1
    int m = (int)(xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;

    // Normalize subnormal: multiply by 2^126 via bit tricks, then recompute
    // the exponent; c selects the corrected values when m == -127.
    uint xis = as_uint(as_float(xi | 0x3f800000) - 1.0f);
    int ms = (int)(xis >> EXPSHIFTBITS_SP32) - 253;
    int c = m == -127;
    m = c ? ms : m;
    uint xin = c ? xis : xi;

    float mf = (float)m;
    // Table index from the top 7 mantissa bits, rounded at bit 15.
    uint indx = (xin & 0x007f0000) + ((xin & 0x00008000) << 1);

    // F - Y
    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (xin & MANTBITS_SP32));

    indx = indx >> 16;
    r = f * p_inv[indx];

    // 1/3, 1/2
    float poly = mad(mad(r, 0x1.555556p-2f, 0.5f), r*r, r);

    // log(x) = m*log(2) + log(G) - poly, assembled head/tail per base.
    float2 tv = p_log[indx];

#if defined(COMPILING_LOG2)
    z1 = tv.s0 + mf;
    z2 = mad(poly, -LOG2E, tv.s1);
#elif defined(COMPILING_LOG10)
    z1 = mad(mf, LOG10_2_HEAD, tv.s0);
    z2 = mad(poly, -LOG10E, mf*LOG10_2_TAIL) + tv.s1;
#else
    z1 = mad(mf, LOG2_HEAD, tv.s0);
    z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
#endif

    float z = z1 + z2;
    z = near1 ? znear1 : z;

    // Corner cases
    z = ax >= PINFBITPATT_SP32 ? x : z;                  // +Inf / NaN pass through
    z = xi != ax ? as_float(QNANBITPATT_SP32) : z;       // negative input -> NaN
    z = ax == 0 ? as_float(NINFBITPATT_SP32) : z;        // log(0) = -Inf

    return z;
}
diff --git a/amd-builtins/math32/logF_table.h b/amd-builtins/math32/logF_table.h new file mode 100644 index 0000000..4b52129 --- /dev/null +++ b/amd-builtins/math32/logF_table.h
@@ -0,0 +1,682 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +DECLARE_TABLE(float2, LOG2_TBL, 129, + (float2)(0x0.000000p+0f, 0x0.000000p+0f), + (float2)(0x1.6f8000p-7f, 0x1.942dbap-17f), + (float2)(0x1.6e0000p-6f, 0x1.e5a170p-16f), + (float2)(0x1.118000p-5f, 0x1.347544p-15f), + (float2)(0x1.6b8000p-5f, 0x1.69bac6p-16f), + (float2)(0x1.c48000p-5f, 0x1.7eae42p-15f), + (float2)(0x1.0e8000p-4f, 0x1.9c4fd0p-15f), + (float2)(0x1.3a8000p-4f, 0x1.17ee92p-15f), + (float2)(0x1.660000p-4f, 0x1.fb7d64p-15f), + (float2)(0x1.918000p-4f, 0x1.42dc8cp-17f), + (float2)(0x1.bc8000p-4f, 0x1.0902b6p-18f), + (float2)(0x1.e70000p-4f, 0x1.7608bep-15f), + (float2)(0x1.088000p-3f, 0x1.162336p-13f), + (float2)(0x1.1d8000p-3f, 0x1.3465d4p-13f), + (float2)(0x1.328000p-3f, 0x1.74f13cp-14f), + (float2)(0x1.470000p-3f, 0x1.aa7e60p-13f), + (float2)(0x1.5c0000p-3f, 0x1.a39fbcp-19f), + (float2)(0x1.700000p-3f, 0x1.d0b53ap-13f), + (float2)(0x1.848000p-3f, 0x1.0af40ap-13f), + (float2)(0x1.988000p-3f, 0x1.b741dep-13f), + (float2)(0x1.ac8000p-3f, 0x1.d78b6cp-13f), + (float2)(0x1.c08000p-3f, 0x1.6db376p-13f), + (float2)(0x1.d48000p-3f, 0x1.ee4c32p-15f), + (float2)(0x1.e80000p-3f, 0x1.02f9d2p-13f), + (float2)(0x1.fb8000p-3f, 0x1.05ae40p-13f), + (float2)(0x1.078000p-2f, 0x1.0adbb0p-14f), + (float2)(0x1.110000p-2f, 0x1.83ed68p-13f), + (float2)(0x1.1a8000p-2f, 0x1.016ca4p-12f), + (float2)(0x1.240000p-2f, 0x1.01eac2p-12f), + (float2)(0x1.2d8000p-2f, 0x1.887e26p-13f), + (float2)(0x1.370000p-2f, 0x1.24cea4p-14f), + (float2)(0x1.400000p-2f, 0x1.918ec6p-12f), + (float2)(0x1.498000p-2f, 0x1.3c25e6p-13f), + (float2)(0x1.528000p-2f, 0x1.6f7f12p-12f), + (float2)(0x1.5c0000p-2f, 0x1.a39fbcp-18f), + (float2)(0x1.650000p-2f, 0x1.8fe466p-14f), + (float2)(0x1.6e0000p-2f, 0x1.10e6cep-13f), + (float2)(0x1.770000p-2f, 0x1.d2ba7ep-14f), + (float2)(0x1.800000p-2f, 0x1.4ac62cp-15f), + (float2)(0x1.888000p-2f, 0x1.a71cb8p-12f), + (float2)(0x1.918000p-2f, 0x1.dd448ep-13f), + (float2)(0x1.9a8000p-2f, 0x1.1c8f10p-21f), + (float2)(0x1.a30000p-2f, 0x1.bb053ep-13f), + 
(float2)(0x1.ab8000p-2f, 0x1.861e5ep-12f), + (float2)(0x1.b40000p-2f, 0x1.fafdcep-12f), + (float2)(0x1.bd0000p-2f, 0x1.e5d3cep-15f), + (float2)(0x1.c58000p-2f, 0x1.2fad28p-14f), + (float2)(0x1.ce0000p-2f, 0x1.492474p-15f), + (float2)(0x1.d60000p-2f, 0x1.d4f80cp-12f), + (float2)(0x1.de8000p-2f, 0x1.4ff510p-12f), + (float2)(0x1.e70000p-2f, 0x1.3550f2p-13f), + (float2)(0x1.ef0000p-2f, 0x1.b59ccap-12f), + (float2)(0x1.f78000p-2f, 0x1.42b464p-13f), + (float2)(0x1.ff8000p-2f, 0x1.5e66a0p-12f), + (float2)(0x1.038000p-1f, 0x1.f6a2e4p-11f), + (float2)(0x1.080000p-1f, 0x1.39e4fep-14f), + (float2)(0x1.0c0000p-1f, 0x1.0500d6p-13f), + (float2)(0x1.100000p-1f, 0x1.13b152p-13f), + (float2)(0x1.140000p-1f, 0x1.93f542p-14f), + (float2)(0x1.180000p-1f, 0x1.467b94p-16f), + (float2)(0x1.1b8000p-1f, 0x1.cc47a4p-11f), + (float2)(0x1.1f8000p-1f, 0x1.78f4c2p-11f), + (float2)(0x1.238000p-1f, 0x1.107508p-11f), + (float2)(0x1.278000p-1f, 0x1.2602c2p-12f), + (float2)(0x1.2b8000p-1f, 0x1.a39fbcp-20f), + (float2)(0x1.2f0000p-1f, 0x1.5a1d7ap-11f), + (float2)(0x1.330000p-1f, 0x1.3e355ap-12f), + (float2)(0x1.368000p-1f, 0x1.cffedap-11f), + (float2)(0x1.3a8000p-1f, 0x1.d9fd50p-12f), + (float2)(0x1.3e0000p-1f, 0x1.f64de6p-11f), + (float2)(0x1.420000p-1f, 0x1.d83f4cp-12f), + (float2)(0x1.458000p-1f, 0x1.cea628p-11f), + (float2)(0x1.498000p-1f, 0x1.3c25e6p-12f), + (float2)(0x1.4d0000p-1f, 0x1.5a96ccp-11f), + (float2)(0x1.510000p-1f, 0x1.18708ap-17f), + (float2)(0x1.548000p-1f, 0x1.374652p-12f), + (float2)(0x1.580000p-1f, 0x1.2089a6p-11f), + (float2)(0x1.5b8000p-1f, 0x1.93432cp-11f), + (float2)(0x1.5f0000p-1f, 0x1.f3fd06p-11f), + (float2)(0x1.630000p-1f, 0x1.0b8f54p-13f), + (float2)(0x1.668000p-1f, 0x1.004722p-12f), + (float2)(0x1.6a0000p-1f, 0x1.57cf2cp-12f), + (float2)(0x1.6d8000p-1f, 0x1.8cb53ap-12f), + (float2)(0x1.710000p-1f, 0x1.9f4d8ap-12f), + (float2)(0x1.748000p-1f, 0x1.8feb26p-12f), + (float2)(0x1.780000p-1f, 0x1.5edfeep-12f), + (float2)(0x1.7b8000p-1f, 0x1.0c7c9ap-12f), + 
(float2)(0x1.7f0000p-1f, 0x1.322182p-13f), + (float2)(0x1.828000p-1f, 0x1.3ab7cep-18f), + (float2)(0x1.858000p-1f, 0x1.a82c2cp-11f), + (float2)(0x1.890000p-1f, 0x1.3dd2c0p-11f), + (float2)(0x1.8c8000p-1f, 0x1.871da4p-12f), + (float2)(0x1.900000p-1f, 0x1.cc2c00p-14f), + (float2)(0x1.930000p-1f, 0x1.9fdb68p-11f), + (float2)(0x1.968000p-1f, 0x1.ed6956p-12f), + (float2)(0x1.9a0000p-1f, 0x1.f1a760p-14f), + (float2)(0x1.9d0000p-1f, 0x1.767f54p-11f), + (float2)(0x1.a08000p-1f, 0x1.3f6d26p-12f), + (float2)(0x1.a38000p-1f, 0x1.b9fce2p-11f), + (float2)(0x1.a70000p-1f, 0x1.8ae816p-12f), + (float2)(0x1.aa0000p-1f, 0x1.c23d60p-11f), + (float2)(0x1.ad8000p-1f, 0x1.60f388p-12f), + (float2)(0x1.b08000p-1f, 0x1.9049aep-11f), + (float2)(0x1.b40000p-1f, 0x1.8734a8p-13f), + (float2)(0x1.b70000p-1f, 0x1.2523d4p-11f), + (float2)(0x1.ba0000p-1f, 0x1.da6ce6p-11f), + (float2)(0x1.bd8000p-1f, 0x1.038e62p-12f), + (float2)(0x1.c08000p-1f, 0x1.1b511ep-11f), + (float2)(0x1.c38000p-1f, 0x1.a728b8p-11f), + (float2)(0x1.c70000p-1f, 0x1.2b5d22p-14f), + (float2)(0x1.ca0000p-1f, 0x1.2c6e54p-12f), + (float2)(0x1.cd0000p-1f, 0x1.f35064p-12f), + (float2)(0x1.d00000p-1f, 0x1.4fdb48p-11f), + (float2)(0x1.d30000p-1f, 0x1.98ec9ep-11f), + (float2)(0x1.d60000p-1f, 0x1.d4f80cp-11f), + (float2)(0x1.d98000p-1f, 0x1.0643d6p-17f), + (float2)(0x1.dc8000p-1f, 0x1.33567ep-14f), + (float2)(0x1.df8000p-1f, 0x1.e0410cp-14f), + (float2)(0x1.e28000p-1f, 0x1.142e0ep-13f), + (float2)(0x1.e58000p-1f, 0x1.063c88p-13f), + (float2)(0x1.e88000p-1f, 0x1.8d66c4p-14f), + (float2)(0x1.eb8000p-1f, 0x1.57e32ap-15f), + (float2)(0x1.ee0000p-1f, 0x1.ed1c6cp-11f), + (float2)(0x1.f10000p-1f, 0x1.b8a076p-11f), + (float2)(0x1.f40000p-1f, 0x1.7822f2p-11f), + (float2)(0x1.f70000p-1f, 0x1.2bbc3ap-11f), + (float2)(0x1.fa0000p-1f, 0x1.a708bap-12f), + (float2)(0x1.fd0000p-1f, 0x1.be4c7ep-13f), + (float2)(0x1.000000p+0f, 0x0.000000p+0f), +) + +DECLARE_TABLE(float2, LOG10_TBL, 129, + (float2)(0x0.000000p+0f, 0x0.000000p+0f), + 
(float2)(0x1.ba8000p-9f, 0x1.f51c88p-19f), + (float2)(0x1.b90000p-8f, 0x1.1da93ep-18f), + (float2)(0x1.498000p-7f, 0x1.8428a2p-18f), + (float2)(0x1.b58000p-7f, 0x1.a423acp-17f), + (float2)(0x1.108000p-6f, 0x1.41d422p-17f), + (float2)(0x1.458000p-6f, 0x1.d3d6b2p-16f), + (float2)(0x1.7a8000p-6f, 0x1.70f7cep-16f), + (float2)(0x1.af0000p-6f, 0x1.7e4ac0p-16f), + (float2)(0x1.e38000p-6f, 0x1.ab2f40p-24f), + (float2)(0x1.0b8000p-5f, 0x1.00d40ap-16f), + (float2)(0x1.250000p-5f, 0x1.40b03ep-15f), + (float2)(0x1.3e8000p-5f, 0x1.446668p-15f), + (float2)(0x1.580000p-5f, 0x1.1c7758p-16f), + (float2)(0x1.710000p-5f, 0x1.20d09ep-15f), + (float2)(0x1.8a0000p-5f, 0x1.fd6f5cp-16f), + (float2)(0x1.a30000p-5f, 0x1.53ac12p-18f), + (float2)(0x1.bb8000p-5f, 0x1.4d02c6p-16f), + (float2)(0x1.d40000p-5f, 0x1.d5164ep-17f), + (float2)(0x1.ec0000p-5f, 0x1.991facp-15f), + (float2)(0x1.020000p-4f, 0x1.0a307cp-14f), + (float2)(0x1.0e0000p-4f, 0x1.e94ec0p-15f), + (float2)(0x1.1a0000p-4f, 0x1.1a22a8p-15f), + (float2)(0x1.258000p-4f, 0x1.d4857ap-14f), + (float2)(0x1.318000p-4f, 0x1.982ae2p-15f), + (float2)(0x1.3d0000p-4f, 0x1.74cd70p-14f), + (float2)(0x1.488000p-4f, 0x1.cfb476p-14f), + (float2)(0x1.540000p-4f, 0x1.ddcc64p-14f), + (float2)(0x1.5f8000p-4f, 0x1.a01222p-14f), + (float2)(0x1.6b0000p-4f, 0x1.177dbcp-14f), + (float2)(0x1.768000p-4f, 0x1.140a24p-16f), + (float2)(0x1.818000p-4f, 0x1.298f40p-14f), + (float2)(0x1.8c8000p-4f, 0x1.c60e20p-14f), + (float2)(0x1.980000p-4f, 0x1.b65052p-18f), + (float2)(0x1.a30000p-4f, 0x1.53ac12p-17f), + (float2)(0x1.ad8000p-4f, 0x1.f41d04p-14f), + (float2)(0x1.b88000p-4f, 0x1.7934eap-14f), + (float2)(0x1.c38000p-4f, 0x1.75252ep-15f), + (float2)(0x1.ce0000p-4f, 0x1.b90790p-14f), + (float2)(0x1.d90000p-4f, 0x1.d5866ap-16f), + (float2)(0x1.e38000p-4f, 0x1.e0d586p-15f), + (float2)(0x1.ee0000p-4f, 0x1.2ae984p-14f), + (float2)(0x1.f88000p-4f, 0x1.25a0d0p-14f), + (float2)(0x1.018000p-3f, 0x1.c2a064p-15f), + (float2)(0x1.068000p-3f, 0x1.2f59e8p-13f), + 
(float2)(0x1.0b8000p-3f, 0x1.cf424cp-13f), + (float2)(0x1.110000p-3f, 0x1.42f080p-15f), + (float2)(0x1.160000p-3f, 0x1.684156p-14f), + (float2)(0x1.1b0000p-3f, 0x1.f38f64p-14f), + (float2)(0x1.200000p-3f, 0x1.22077ap-13f), + (float2)(0x1.250000p-3f, 0x1.2d34d6p-13f), + (float2)(0x1.2a0000p-3f, 0x1.1ba328p-13f), + (float2)(0x1.2f0000p-3f, 0x1.db48e2p-14f), + (float2)(0x1.340000p-3f, 0x1.4712a0p-14f), + (float2)(0x1.390000p-3f, 0x1.ed0894p-16f), + (float2)(0x1.3d8000p-3f, 0x1.bc39b6p-13f), + (float2)(0x1.428000p-3f, 0x1.1f9ff8p-13f), + (float2)(0x1.478000p-3f, 0x1.a07d3ap-15f), + (float2)(0x1.4c0000p-3f, 0x1.9601fap-13f), + (float2)(0x1.510000p-3f, 0x1.532214p-14f), + (float2)(0x1.558000p-3f, 0x1.a31462p-13f), + (float2)(0x1.5a8000p-3f, 0x1.05a584p-14f), + (float2)(0x1.5f0000p-3f, 0x1.4911c8p-13f), + (float2)(0x1.638000p-3f, 0x1.f615fep-13f), + (float2)(0x1.688000p-3f, 0x1.1445b0p-14f), + (float2)(0x1.6d0000p-3f, 0x1.057abcp-13f), + (float2)(0x1.718000p-3f, 0x1.685f0ap-13f), + (float2)(0x1.760000p-3f, 0x1.b31022p-13f), + (float2)(0x1.7a8000p-3f, 0x1.e5cd62p-13f), + (float2)(0x1.7f8000p-3f, 0x1.aa6ca8p-22f), + (float2)(0x1.840000p-3f, 0x1.1944bcp-19f), + (float2)(0x1.880000p-3f, 0x1.f0b980p-13f), + (float2)(0x1.8c8000p-3f, 0x1.c60e20p-13f), + (float2)(0x1.910000p-3f, 0x1.849daep-13f), + (float2)(0x1.958000p-3f, 0x1.2ca202p-13f), + (float2)(0x1.9a0000p-3f, 0x1.7ca842p-14f), + (float2)(0x1.9e8000p-3f, 0x1.cf6180p-16f), + (float2)(0x1.a28000p-3f, 0x1.9fa186p-13f), + (float2)(0x1.a70000p-3f, 0x1.df5554p-14f), + (float2)(0x1.ab8000p-3f, 0x1.51eaccp-16f), + (float2)(0x1.af8000p-3f, 0x1.4f8e88p-13f), + (float2)(0x1.b40000p-3f, 0x1.7f49aap-15f), + (float2)(0x1.b80000p-3f, 0x1.5b3c72p-13f), + (float2)(0x1.bc8000p-3f, 0x1.07fd5cp-15f), + (float2)(0x1.c08000p-3f, 0x1.144d18p-13f), + (float2)(0x1.c48000p-3f, 0x1.d25700p-13f), + (float2)(0x1.c90000p-3f, 0x1.f1369ep-15f), + (float2)(0x1.cd0000p-3f, 0x1.1260fap-13f), + (float2)(0x1.d10000p-3f, 0x1.94c038p-13f), + 
(float2)(0x1.d58000p-3f, 0x1.ccfdb8p-20f), + (float2)(0x1.d98000p-3f, 0x1.7c70dap-15f), + (float2)(0x1.dd8000p-3f, 0x1.4ee87ap-14f), + (float2)(0x1.e18000p-3f, 0x1.b99d86p-14f), + (float2)(0x1.e58000p-3f, 0x1.feafc0p-14f), + (float2)(0x1.e98000p-3f, 0x1.0f3b16p-13f), + (float2)(0x1.ed8000p-3f, 0x1.0ca34cp-13f), + (float2)(0x1.f18000p-3f, 0x1.ef75b2p-14f), + (float2)(0x1.f58000p-3f, 0x1.a15704p-14f), + (float2)(0x1.f98000p-3f, 0x1.2f3cfap-14f), + (float2)(0x1.fd8000p-3f, 0x1.32f1dcp-15f), + (float2)(0x1.008000p-2f, 0x1.f02d90p-13f), + (float2)(0x1.028000p-2f, 0x1.821964p-13f), + (float2)(0x1.048000p-2f, 0x1.02a708p-13f), + (float2)(0x1.068000p-2f, 0x1.c7f450p-15f), + (float2)(0x1.080000p-2f, 0x1.e820cap-12f), + (float2)(0x1.0a0000p-2f, 0x1.8ecd14p-12f), + (float2)(0x1.0c0000p-2f, 0x1.2d15f4p-12f), + (float2)(0x1.0e0000p-2f, 0x1.861b72p-13f), + (float2)(0x1.100000p-2f, 0x1.4319e6p-14f), + (float2)(0x1.118000p-2f, 0x1.d6520ep-12f), + (float2)(0x1.138000p-2f, 0x1.53c218p-12f), + (float2)(0x1.158000p-2f, 0x1.925000p-13f), + (float2)(0x1.178000p-2f, 0x1.b4a7a2p-15f), + (float2)(0x1.190000p-2f, 0x1.9c19eep-12f), + (float2)(0x1.1b0000p-2f, 0x1.f38f64p-13f), + (float2)(0x1.1d0000p-2f, 0x1.3ebb32p-14f), + (float2)(0x1.1e8000p-2f, 0x1.9ddf96p-12f), + (float2)(0x1.208000p-2f, 0x1.c8d472p-13f), + (float2)(0x1.228000p-2f, 0x1.1af536p-15f), + (float2)(0x1.240000p-2f, 0x1.5acca0p-12f), + (float2)(0x1.260000p-2f, 0x1.158770p-13f), + (float2)(0x1.278000p-2f, 0x1.b35350p-12f), + (float2)(0x1.298000p-2f, 0x1.a91532p-13f), + (float2)(0x1.2b0000p-2f, 0x1.ee7896p-12f), + (float2)(0x1.2d0000p-2f, 0x1.012c1cp-12f), + (float2)(0x1.2f0000p-2f, 0x1.967ab4p-17f), + (float2)(0x1.308000p-2f, 0x1.111e3cp-12f), + (float2)(0x1.328000p-2f, 0x1.cf340ep-17f), + (float2)(0x1.340000p-2f, 0x1.04d426p-12f), +) + +DECLARE_TABLE(float2, LOGE_TBL, 129, + (float2)(0x0.000000p+0f, 0x0.000000p+0f), + (float2)(0x1.fe0000p-8f, 0x1.535882p-23f), + (float2)(0x1.fc0000p-7f, 0x1.5161f8p-20f), + 
(float2)(0x1.7b8000p-6f, 0x1.1b07d4p-18f), + (float2)(0x1.f82000p-6f, 0x1.361cf0p-19f), + (float2)(0x1.39e000p-5f, 0x1.0f73fcp-18f), + (float2)(0x1.774000p-5f, 0x1.63d8cap-19f), + (float2)(0x1.b42000p-5f, 0x1.bae232p-18f), + (float2)(0x1.f0a000p-5f, 0x1.86008ap-20f), + (float2)(0x1.164000p-4f, 0x1.36eea2p-16f), + (float2)(0x1.340000p-4f, 0x1.d7961ap-16f), + (float2)(0x1.51a000p-4f, 0x1.073f06p-16f), + (float2)(0x1.6f0000p-4f, 0x1.a515cap-17f), + (float2)(0x1.8c2000p-4f, 0x1.45d630p-16f), + (float2)(0x1.a92000p-4f, 0x1.b4e92ap-18f), + (float2)(0x1.c5e000p-4f, 0x1.523d6ep-18f), + (float2)(0x1.e26000p-4f, 0x1.076e2ap-16f), + (float2)(0x1.fec000p-4f, 0x1.2263b6p-17f), + (float2)(0x1.0d6000p-3f, 0x1.7e7cd0p-15f), + (float2)(0x1.1b6000p-3f, 0x1.2ad52ep-15f), + (float2)(0x1.294000p-3f, 0x1.52f81ep-15f), + (float2)(0x1.370000p-3f, 0x1.fc201ep-15f), + (float2)(0x1.44c000p-3f, 0x1.2b6ccap-15f), + (float2)(0x1.526000p-3f, 0x1.cbc742p-16f), + (float2)(0x1.5fe000p-3f, 0x1.3070a6p-15f), + (float2)(0x1.6d6000p-3f, 0x1.fce33ap-20f), + (float2)(0x1.7aa000p-3f, 0x1.890210p-15f), + (float2)(0x1.87e000p-3f, 0x1.a06520p-15f), + (float2)(0x1.952000p-3f, 0x1.6a73d0p-17f), + (float2)(0x1.a22000p-3f, 0x1.bc1fe2p-15f), + (float2)(0x1.af2000p-3f, 0x1.c94e80p-15f), + (float2)(0x1.bc2000p-3f, 0x1.0ce85ap-16f), + (float2)(0x1.c8e000p-3f, 0x1.f7c79ap-15f), + (float2)(0x1.d5c000p-3f, 0x1.0b5a7cp-18f), + (float2)(0x1.e26000p-3f, 0x1.076e2ap-15f), + (float2)(0x1.ef0000p-3f, 0x1.5b97b8p-16f), + (float2)(0x1.fb8000p-3f, 0x1.186d5ep-15f), + (float2)(0x1.040000p-2f, 0x1.2ca5a6p-17f), + (float2)(0x1.0a2000p-2f, 0x1.24e272p-14f), + (float2)(0x1.104000p-2f, 0x1.8bf9aep-14f), + (float2)(0x1.166000p-2f, 0x1.5cabaap-14f), + (float2)(0x1.1c8000p-2f, 0x1.3182d2p-15f), + (float2)(0x1.228000p-2f, 0x1.41fbcep-14f), + (float2)(0x1.288000p-2f, 0x1.5a13dep-14f), + (float2)(0x1.2e8000p-2f, 0x1.c575c2p-15f), + (float2)(0x1.346000p-2f, 0x1.dd9a98p-14f), + (float2)(0x1.3a6000p-2f, 0x1.3155a4p-16f), + 
(float2)(0x1.404000p-2f, 0x1.843434p-17f), + (float2)(0x1.460000p-2f, 0x1.8bc21cp-14f), + (float2)(0x1.4be000p-2f, 0x1.7e55dcp-16f), + (float2)(0x1.51a000p-2f, 0x1.5b0e5ap-15f), + (float2)(0x1.576000p-2f, 0x1.dc5d14p-16f), + (float2)(0x1.5d0000p-2f, 0x1.bdbf58p-14f), + (float2)(0x1.62c000p-2f, 0x1.05e572p-15f), + (float2)(0x1.686000p-2f, 0x1.903d36p-15f), + (float2)(0x1.6e0000p-2f, 0x1.1d5456p-15f), + (float2)(0x1.738000p-2f, 0x1.d7f6bap-14f), + (float2)(0x1.792000p-2f, 0x1.4abfbap-15f), + (float2)(0x1.7ea000p-2f, 0x1.f07704p-15f), + (float2)(0x1.842000p-2f, 0x1.a3b43cp-15f), + (float2)(0x1.89a000p-2f, 0x1.9c360ap-17f), + (float2)(0x1.8f0000p-2f, 0x1.1e8736p-14f), + (float2)(0x1.946000p-2f, 0x1.941c20p-14f), + (float2)(0x1.99c000p-2f, 0x1.958116p-14f), + (float2)(0x1.9f2000p-2f, 0x1.23ecbep-14f), + (float2)(0x1.a48000p-2f, 0x1.024396p-16f), + (float2)(0x1.a9c000p-2f, 0x1.d93534p-15f), + (float2)(0x1.af0000p-2f, 0x1.293246p-14f), + (float2)(0x1.b44000p-2f, 0x1.eef798p-15f), + (float2)(0x1.b98000p-2f, 0x1.625a4cp-16f), + (float2)(0x1.bea000p-2f, 0x1.4d9da6p-14f), + (float2)(0x1.c3c000p-2f, 0x1.d7a7ccp-14f), + (float2)(0x1.c8e000p-2f, 0x1.f7c79ap-14f), + (float2)(0x1.ce0000p-2f, 0x1.af0b84p-14f), + (float2)(0x1.d32000p-2f, 0x1.fcfc00p-15f), + (float2)(0x1.d82000p-2f, 0x1.e7258ap-14f), + (float2)(0x1.dd4000p-2f, 0x1.a81306p-16f), + (float2)(0x1.e24000p-2f, 0x1.1034f8p-15f), + (float2)(0x1.e74000p-2f, 0x1.09875ap-16f), + (float2)(0x1.ec2000p-2f, 0x1.99d246p-14f), + (float2)(0x1.f12000p-2f, 0x1.1ebf5ep-15f), + (float2)(0x1.f60000p-2f, 0x1.23fa70p-14f), + (float2)(0x1.fae000p-2f, 0x1.588f78p-14f), + (float2)(0x1.ffc000p-2f, 0x1.2e0856p-14f), + (float2)(0x1.024000p-1f, 0x1.52a5a4p-13f), + (float2)(0x1.04a000p-1f, 0x1.df9da8p-13f), + (float2)(0x1.072000p-1f, 0x1.f2e0e6p-16f), + (float2)(0x1.098000p-1f, 0x1.bd3d5cp-15f), + (float2)(0x1.0be000p-1f, 0x1.cb9094p-15f), + (float2)(0x1.0e4000p-1f, 0x1.261746p-15f), + (float2)(0x1.108000p-1f, 0x1.f39e2cp-13f), + 
(float2)(0x1.12e000p-1f, 0x1.719592p-13f), + (float2)(0x1.154000p-1f, 0x1.87a5e8p-14f), + (float2)(0x1.178000p-1f, 0x1.eabbd8p-13f), + (float2)(0x1.19e000p-1f, 0x1.cd68cep-14f), + (float2)(0x1.1c2000p-1f, 0x1.b81f70p-13f), + (float2)(0x1.1e8000p-1f, 0x1.7d79c0p-15f), + (float2)(0x1.20c000p-1f, 0x1.b9a324p-14f), + (float2)(0x1.230000p-1f, 0x1.30d7bep-13f), + (float2)(0x1.254000p-1f, 0x1.5bce98p-13f), + (float2)(0x1.278000p-1f, 0x1.5e1288p-13f), + (float2)(0x1.29c000p-1f, 0x1.37fec2p-13f), + (float2)(0x1.2c0000p-1f, 0x1.d3da88p-14f), + (float2)(0x1.2e4000p-1f, 0x1.d0db90p-15f), + (float2)(0x1.306000p-1f, 0x1.d7334ep-13f), + (float2)(0x1.32a000p-1f, 0x1.133912p-13f), + (float2)(0x1.34e000p-1f, 0x1.44ece6p-16f), + (float2)(0x1.370000p-1f, 0x1.17b546p-13f), + (float2)(0x1.392000p-1f, 0x1.e0d356p-13f), + (float2)(0x1.3b6000p-1f, 0x1.0893fep-14f), + (float2)(0x1.3d8000p-1f, 0x1.026a70p-13f), + (float2)(0x1.3fa000p-1f, 0x1.5b84d0p-13f), + (float2)(0x1.41c000p-1f, 0x1.8fe846p-13f), + (float2)(0x1.43e000p-1f, 0x1.9fe2f8p-13f), + (float2)(0x1.460000p-1f, 0x1.8bc21cp-13f), + (float2)(0x1.482000p-1f, 0x1.53d1eap-13f), + (float2)(0x1.4a4000p-1f, 0x1.f0bb60p-14f), + (float2)(0x1.4c6000p-1f, 0x1.e6bf32p-15f), + (float2)(0x1.4e6000p-1f, 0x1.d811b6p-13f), + (float2)(0x1.508000p-1f, 0x1.13cc00p-13f), + (float2)(0x1.52a000p-1f, 0x1.6932dep-16f), + (float2)(0x1.54a000p-1f, 0x1.246798p-13f), + (float2)(0x1.56a000p-1f, 0x1.f9d5b2p-13f), + (float2)(0x1.58c000p-1f, 0x1.5b6b9ap-14f), + (float2)(0x1.5ac000p-1f, 0x1.404c34p-13f), + (float2)(0x1.5cc000p-1f, 0x1.b1dc6cp-13f), + (float2)(0x1.5ee000p-1f, 0x1.54920ap-20f), + (float2)(0x1.60e000p-1f, 0x1.97a23cp-16f), + (float2)(0x1.62e000p-1f, 0x1.0bfbe8p-15f), +) + +DECLARE_TABLE(float, LOG_INV_TBL, 129, + 0x1.000000p+1f, + 0x1.fc07f0p+0f, + 0x1.f81f82p+0f, + 0x1.f4465ap+0f, + 0x1.f07c20p+0f, + 0x1.ecc07cp+0f, + 0x1.e9131ap+0f, + 0x1.e573acp+0f, + 0x1.e1e1e2p+0f, + 0x1.de5d6ep+0f, + 0x1.dae608p+0f, + 0x1.d77b66p+0f, + 0x1.d41d42p+0f, + 
0x1.d0cb58p+0f, + 0x1.cd8568p+0f, + 0x1.ca4b30p+0f, + 0x1.c71c72p+0f, + 0x1.c3f8f0p+0f, + 0x1.c0e070p+0f, + 0x1.bdd2b8p+0f, + 0x1.bacf92p+0f, + 0x1.b7d6c4p+0f, + 0x1.b4e81cp+0f, + 0x1.b20364p+0f, + 0x1.af286cp+0f, + 0x1.ac5702p+0f, + 0x1.a98ef6p+0f, + 0x1.a6d01ap+0f, + 0x1.a41a42p+0f, + 0x1.a16d40p+0f, + 0x1.9ec8eap+0f, + 0x1.9c2d14p+0f, + 0x1.99999ap+0f, + 0x1.970e50p+0f, + 0x1.948b10p+0f, + 0x1.920fb4p+0f, + 0x1.8f9c18p+0f, + 0x1.8d3018p+0f, + 0x1.8acb90p+0f, + 0x1.886e60p+0f, + 0x1.861862p+0f, + 0x1.83c978p+0f, + 0x1.818182p+0f, + 0x1.7f4060p+0f, + 0x1.7d05f4p+0f, + 0x1.7ad220p+0f, + 0x1.78a4c8p+0f, + 0x1.767dcep+0f, + 0x1.745d18p+0f, + 0x1.724288p+0f, + 0x1.702e06p+0f, + 0x1.6e1f76p+0f, + 0x1.6c16c2p+0f, + 0x1.6a13cep+0f, + 0x1.681682p+0f, + 0x1.661ec6p+0f, + 0x1.642c86p+0f, + 0x1.623fa8p+0f, + 0x1.605816p+0f, + 0x1.5e75bcp+0f, + 0x1.5c9882p+0f, + 0x1.5ac056p+0f, + 0x1.58ed24p+0f, + 0x1.571ed4p+0f, + 0x1.555556p+0f, + 0x1.539094p+0f, + 0x1.51d07ep+0f, + 0x1.501502p+0f, + 0x1.4e5e0ap+0f, + 0x1.4cab88p+0f, + 0x1.4afd6ap+0f, + 0x1.49539ep+0f, + 0x1.47ae14p+0f, + 0x1.460cbcp+0f, + 0x1.446f86p+0f, + 0x1.42d662p+0f, + 0x1.414142p+0f, + 0x1.3fb014p+0f, + 0x1.3e22ccp+0f, + 0x1.3c995ap+0f, + 0x1.3b13b2p+0f, + 0x1.3991c2p+0f, + 0x1.381382p+0f, + 0x1.3698e0p+0f, + 0x1.3521d0p+0f, + 0x1.33ae46p+0f, + 0x1.323e34p+0f, + 0x1.30d190p+0f, + 0x1.2f684cp+0f, + 0x1.2e025cp+0f, + 0x1.2c9fb4p+0f, + 0x1.2b404ap+0f, + 0x1.29e412p+0f, + 0x1.288b02p+0f, + 0x1.27350cp+0f, + 0x1.25e228p+0f, + 0x1.24924ap+0f, + 0x1.234568p+0f, + 0x1.21fb78p+0f, + 0x1.20b470p+0f, + 0x1.1f7048p+0f, + 0x1.1e2ef4p+0f, + 0x1.1cf06ap+0f, + 0x1.1bb4a4p+0f, + 0x1.1a7b96p+0f, + 0x1.194538p+0f, + 0x1.181182p+0f, + 0x1.16e068p+0f, + 0x1.15b1e6p+0f, + 0x1.1485f0p+0f, + 0x1.135c82p+0f, + 0x1.12358ep+0f, + 0x1.111112p+0f, + 0x1.0fef02p+0f, + 0x1.0ecf56p+0f, + 0x1.0db20ap+0f, + 0x1.0c9714p+0f, + 0x1.0b7e6ep+0f, + 0x1.0a6810p+0f, + 0x1.0953f4p+0f, + 0x1.084210p+0f, + 0x1.073260p+0f, + 0x1.0624dep+0f, + 0x1.051980p+0f, + 
0x1.041042p+0f, + 0x1.03091cp+0f, + 0x1.020408p+0f, + 0x1.010102p+0f, + 0x1.000000p+0f, +) + +DECLARE_TABLE(float2, LOG_INV_TBL_EP, 129, + (float2)(0x1.000000p+1f, 0x0.000000p+0f), + (float2)(0x1.fc0000p+0f, 0x1.fc07f0p-14f), + (float2)(0x1.f80000p+0f, 0x1.f81f82p-12f), + (float2)(0x1.f40000p+0f, 0x1.196792p-10f), + (float2)(0x1.f00000p+0f, 0x1.f07c20p-10f), + (float2)(0x1.ec0000p+0f, 0x1.80f660p-9f), + (float2)(0x1.e80000p+0f, 0x1.131ac0p-8f), + (float2)(0x1.e40000p+0f, 0x1.73ac90p-8f), + (float2)(0x1.e00000p+0f, 0x1.e1e1e2p-8f), + (float2)(0x1.de0000p+0f, 0x1.75b8fep-10f), + (float2)(0x1.da0000p+0f, 0x1.cc0ed8p-9f), + (float2)(0x1.d60000p+0f, 0x1.7b654cp-8f), + (float2)(0x1.d40000p+0f, 0x1.d41d42p-12f), + (float2)(0x1.d00000p+0f, 0x1.96b1eep-9f), + (float2)(0x1.cc0000p+0f, 0x1.856890p-8f), + (float2)(0x1.ca0000p+0f, 0x1.2cc158p-10f), + (float2)(0x1.c60000p+0f, 0x1.1c71c8p-8f), + (float2)(0x1.c20000p+0f, 0x1.f8f01cp-8f), + (float2)(0x1.c00000p+0f, 0x1.c0e070p-9f), + (float2)(0x1.bc0000p+0f, 0x1.d2b89ap-8f), + (float2)(0x1.ba0000p+0f, 0x1.9f2298p-9f), + (float2)(0x1.b60000p+0f, 0x1.d6c3dep-8f), + (float2)(0x1.b40000p+0f, 0x1.d0369ep-9f), + (float2)(0x1.b20000p+0f, 0x1.b20364p-15f), + (float2)(0x1.ae0000p+0f, 0x1.286bcap-8f), + (float2)(0x1.ac0000p+0f, 0x1.5c06b2p-10f), + (float2)(0x1.a80000p+0f, 0x1.8ef606p-8f), + (float2)(0x1.a60000p+0f, 0x1.a034dap-9f), + (float2)(0x1.a40000p+0f, 0x1.a41a42p-12f), + (float2)(0x1.a00000p+0f, 0x1.6d3f98p-8f), + (float2)(0x1.9e0000p+0f, 0x1.91d2a2p-9f), + (float2)(0x1.9c0000p+0f, 0x1.68a772p-11f), + (float2)(0x1.980000p+0f, 0x1.99999ap-8f), + (float2)(0x1.960000p+0f, 0x1.0e4f80p-8f), + (float2)(0x1.940000p+0f, 0x1.161f9ap-9f), + (float2)(0x1.920000p+0f, 0x1.f693a2p-13f), + (float2)(0x1.8e0000p+0f, 0x1.9c18fap-8f), + (float2)(0x1.8c0000p+0f, 0x1.3018d4p-8f), + (float2)(0x1.8a0000p+0f, 0x1.9721eep-9f), + (float2)(0x1.880000p+0f, 0x1.b97c2ap-10f), + (float2)(0x1.860000p+0f, 0x1.861862p-12f), + (float2)(0x1.820000p+0f, 0x1.c977acp-8f), 
+ (float2)(0x1.800000p+0f, 0x1.818182p-8f), + (float2)(0x1.7e0000p+0f, 0x1.405fd0p-8f), + (float2)(0x1.7c0000p+0f, 0x1.05f418p-8f), + (float2)(0x1.7a0000p+0f, 0x1.a4411cp-9f), + (float2)(0x1.780000p+0f, 0x1.499030p-9f), + (float2)(0x1.760000p+0f, 0x1.f7390ep-10f), + (float2)(0x1.740000p+0f, 0x1.745d18p-10f), + (float2)(0x1.720000p+0f, 0x1.0a1fd2p-10f), + (float2)(0x1.700000p+0f, 0x1.702e06p-11f), + (float2)(0x1.6e0000p+0f, 0x1.f76b44p-12f), + (float2)(0x1.6c0000p+0f, 0x1.6c16c2p-12f), + (float2)(0x1.6a0000p+0f, 0x1.3cd154p-12f), + (float2)(0x1.680000p+0f, 0x1.681682p-12f), + (float2)(0x1.660000p+0f, 0x1.ec6a52p-12f), + (float2)(0x1.640000p+0f, 0x1.642c86p-11f), + (float2)(0x1.620000p+0f, 0x1.fd3b80p-11f), + (float2)(0x1.600000p+0f, 0x1.605816p-10f), + (float2)(0x1.5e0000p+0f, 0x1.d6ee34p-10f), + (float2)(0x1.5c0000p+0f, 0x1.310572p-9f), + (float2)(0x1.5a0000p+0f, 0x1.80ad60p-9f), + (float2)(0x1.580000p+0f, 0x1.da4610p-9f), + (float2)(0x1.560000p+0f, 0x1.1ed3c6p-8f), + (float2)(0x1.540000p+0f, 0x1.555556p-8f), + (float2)(0x1.520000p+0f, 0x1.909490p-8f), + (float2)(0x1.500000p+0f, 0x1.d07eaep-8f), + (float2)(0x1.500000p+0f, 0x1.501502p-12f), + (float2)(0x1.4e0000p+0f, 0x1.7829ccp-10f), + (float2)(0x1.4c0000p+0f, 0x1.5710e4p-9f), + (float2)(0x1.4a0000p+0f, 0x1.fad40ap-9f), + (float2)(0x1.480000p+0f, 0x1.539e3cp-8f), + (float2)(0x1.460000p+0f, 0x1.ae147ap-8f), + (float2)(0x1.460000p+0f, 0x1.978fecp-13f), + (float2)(0x1.440000p+0f, 0x1.be1958p-10f), + (float2)(0x1.420000p+0f, 0x1.acc4bap-9f), + (float2)(0x1.400000p+0f, 0x1.414142p-8f), + (float2)(0x1.3e0000p+0f, 0x1.b013fcp-8f), + (float2)(0x1.3e0000p+0f, 0x1.165e72p-11f), + (float2)(0x1.3c0000p+0f, 0x1.32b490p-9f), + (float2)(0x1.3a0000p+0f, 0x1.13b13cp-8f), + (float2)(0x1.380000p+0f, 0x1.91c2c2p-8f), + (float2)(0x1.380000p+0f, 0x1.381382p-12f), + (float2)(0x1.360000p+0f, 0x1.31be7cp-9f), + (float2)(0x1.340000p+0f, 0x1.21cfb2p-8f), + (float2)(0x1.320000p+0f, 0x1.ae45b6p-8f), + (float2)(0x1.320000p+0f, 0x1.f1a516p-11f), 
+ (float2)(0x1.300000p+0f, 0x1.a32026p-9f), + (float2)(0x1.2e0000p+0f, 0x1.684bdap-8f), + (float2)(0x1.2e0000p+0f, 0x1.2e025cp-15f), + (float2)(0x1.2c0000p+0f, 0x1.3f69b0p-9f), + (float2)(0x1.2a0000p+0f, 0x1.404ad0p-8f), + (float2)(0x1.280000p+0f, 0x1.e4129ep-8f), + (float2)(0x1.280000p+0f, 0x1.160252p-9f), + (float2)(0x1.260000p+0f, 0x1.350b88p-8f), + (float2)(0x1.240000p+0f, 0x1.e22708p-8f), + (float2)(0x1.240000p+0f, 0x1.24924ap-9f), + (float2)(0x1.220000p+0f, 0x1.45678ap-8f), + (float2)(0x1.200000p+0f, 0x1.fb7812p-8f), + (float2)(0x1.200000p+0f, 0x1.68e18cp-9f), + (float2)(0x1.1e0000p+0f, 0x1.7047dcp-8f), + (float2)(0x1.1e0000p+0f, 0x1.779da0p-11f), + (float2)(0x1.1c0000p+0f, 0x1.e0d5b4p-9f), + (float2)(0x1.1a0000p+0f, 0x1.b4a404p-8f), + (float2)(0x1.1a0000p+0f, 0x1.ee5846p-10f), + (float2)(0x1.180000p+0f, 0x1.453808p-8f), + (float2)(0x1.180000p+0f, 0x1.181182p-12f), + (float2)(0x1.160000p+0f, 0x1.c0d128p-9f), + (float2)(0x1.140000p+0f, 0x1.b1e5f8p-8f), + (float2)(0x1.140000p+0f, 0x1.0be1c2p-9f), + (float2)(0x1.120000p+0f, 0x1.5c8114p-8f), + (float2)(0x1.120000p+0f, 0x1.ac73aep-11f), + (float2)(0x1.100000p+0f, 0x1.111112p-8f), + (float2)(0x1.0e0000p+0f, 0x1.ef0110p-8f), + (float2)(0x1.0e0000p+0f, 0x1.9ead7cp-9f), + (float2)(0x1.0c0000p+0f, 0x1.b20a88p-8f), + (float2)(0x1.0c0000p+0f, 0x1.2e29f8p-9f), + (float2)(0x1.0a0000p+0f, 0x1.7e6ec2p-8f), + (float2)(0x1.0a0000p+0f, 0x1.a0429ap-10f), + (float2)(0x1.080000p+0f, 0x1.53f390p-8f), + (float2)(0x1.080000p+0f, 0x1.084210p-10f), + (float2)(0x1.060000p+0f, 0x1.3260a4p-8f), + (float2)(0x1.060000p+0f, 0x1.26e978p-11f), + (float2)(0x1.040000p+0f, 0x1.197f7ep-8f), + (float2)(0x1.040000p+0f, 0x1.041042p-12f), + (float2)(0x1.020000p+0f, 0x1.091b52p-8f), + (float2)(0x1.020000p+0f, 0x1.020408p-14f), + (float2)(0x1.000000p+0f, 0x1.010102p-8f), + (float2)(0x1.000000p+0f, 0x0.000000p+0f), +) +
diff --git a/amd-builtins/math32/logbF.cl b/amd-builtins/math32/logbF.cl new file mode 100644 index 0000000..ba18634 --- /dev/null +++ b/amd-builtins/math32/logbF.cl
@@ -0,0 +1,35 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline)) float +logb(float x) +{ + int ax = as_int(x) & EXSIGNBIT_SP32; + float s = -118 - clz(ax); + float r = (ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + r = ax >= PINFBITPATT_SP32 ? as_float(ax) : r; + r = ax < 0x00800000 ? s : r; + r = ax == 0 ? as_float(NINFBITPATT_SP32) : r; + return r; +}
diff --git a/amd-builtins/math32/madF.cl b/amd-builtins/math32/madF.cl new file mode 100644 index 0000000..47c6736 --- /dev/null +++ b/amd-builtins/math32/madF.cl
@@ -0,0 +1,29 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline)) float +mad(float x, float y, float z) +{ + return __amdil_mad_f32(x, y, z); +}
diff --git a/amd-builtins/math32/math32.h b/amd-builtins/math32/math32.h new file mode 100644 index 0000000..5d98944 --- /dev/null +++ b/amd-builtins/math32/math32.h
@@ -0,0 +1,119 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MATH32_H +#define MATH32_H 1 + +extern __attribute__((pure)) float __amdil_copysign_f32(float, float); +extern __attribute__((pure)) float __amdil_fma_f32(float, float, float); +extern __attribute__((pure)) float __amdil_mad_f32(float, float, float); +extern __attribute__((pure)) float __amdil_min_f32(float, float); +extern __attribute__((pure)) float __amdil_max_f32(float, float); +extern __attribute__((pure)) float __ftz_f32(float); /* flushes a subnormal to zero (see FTZ macro below) */ +extern __attribute__((pure)) float __amdil_round_nearest_f32(float); +extern __attribute__((pure)) float __amdil_round_neginf_f32(float); +extern __attribute__((pure)) float __amdil_round_posinf_f32(float); +extern __attribute__((pure)) float __amdil_round_zero_f32(float); +extern __attribute__((pure)) float __amdil_fabs_f32(float); +extern __attribute__((pure)) float __amdil_improved_div_f32(float, float); +extern __attribute__((pure)) float __amdil_fraction_f32(float); +extern __attribute__((pure)) uint __amdil_cmov_logical_i32(uint, uint, uint); +extern __attribute__((pure)) uint __amdil_is_asic_id_i32(uint); +extern __attribute__((pure)) uint __amdil_is_constant_f32(float); + +#define SNAN 0x001 /* fp-class mask bits consumed by __amdil_class_f32 below */ +#define QNAN 0x002 +#define NINF 0x004 +#define NNOR 0x008 +#define NSUB 0x010 +#define NZER 0x020 +#define PZER 0x040 +#define PSUB 0x080 +#define PNOR 0x100 +#define PINF 0x200 + +extern __attribute__((pure)) int __amdil_class_f32(float, int); /* classify a float; second argument is an OR of the mask bits above */ + +// HSA definitions for these macros +#define HAVE_HW_FMA32() (1) +#define HAVE_BITALIGN() (0) +#define HAVE_FAST_FMA32() (0) + + +// Allow control over how division is done +#define MATH_DIVIDE(X, Y) native_divide(X, Y) +// #define MATH_DIVIDE(X,Y) ((X) / (Y)) +#define MATH_RECIP(X) native_recip(X) +// #define MATH_RECIP(X) (1.0f / (X)) + +// Allow control over square root +#define MATH_SQRT(X) native_sqrt(X) + +// Force a flush of a subnormal to zero by feeding it through a functional unit +#define FTZ(X) __ftz_f32(X) + +// Table stuff +#define TABLE_SPACE __constant + +#define 
TABLE_MANGLE(NAME) __math32_##NAME /* prefix table symbols to keep them out of the user namespace */ + +#define USE_TABLE(TYPE,PTR,NAME) \ + extern TABLE_SPACE TYPE TABLE_MANGLE(NAME) []; \ + TABLE_SPACE TYPE * PTR = TABLE_MANGLE(NAME) + +#define DECLARE_TABLE(TYPE,NAME,LENGTH,...) \ + TABLE_SPACE TYPE TABLE_MANGLE(NAME) [ LENGTH ] = { __VA_ARGS__ }; + +/* These definitions, used by float functions, + are for both 32 and 64 bit machines */ +#define SIGNBIT_SP32 0x80000000 +#define EXSIGNBIT_SP32 0x7fffffff +#define EXPBITS_SP32 0x7f800000 +#define MANTBITS_SP32 0x007fffff +#define ONEEXPBITS_SP32 0x3f800000 +#define TWOEXPBITS_SP32 0x40000000 +#define HALFEXPBITS_SP32 0x3f000000 +#define IMPBIT_SP32 0x00800000 +#define QNANBITPATT_SP32 0x7fc00000 +#define INDEFBITPATT_SP32 0xffc00000 +#define PINFBITPATT_SP32 0x7f800000 +#define NINFBITPATT_SP32 0xff800000 +#define EXPBIAS_SP32 127 +#define EXPSHIFTBITS_SP32 23 +#define BIASEDEMIN_SP32 1 +#define EMIN_SP32 -126 +#define BIASEDEMAX_SP32 254 +#define EMAX_SP32 127 +#define LAMBDA_SP32 1.0e30 +#define MANTLENGTH_SP32 24 +#define BASEDIGITS_SP32 7 + +#define ALIGNED(x) __attribute__((aligned(x))) + +// Workaround a bug in the Apple linker that prevents inlining of large, +// frequently-used static functions that only have the inline attribute. +// Force all inline functions to be always_inlined. +#ifdef USE_APPLE +#define inline __attribute__((always_inline)) +#endif + +#endif /* MATH32_H */ +
diff --git a/amd-builtins/math32/maxmagF.cl b/amd-builtins/math32/maxmagF.cl new file mode 100644 index 0000000..44015d9 --- /dev/null +++ b/amd-builtins/math32/maxmagF.cl
@@ -0,0 +1,38 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" /* maxmag(x,y): the argument with the greater magnitude, branch-free via bit masks */ + +__attribute__((overloadable, always_inline)) float +maxmag(float x, float y) +{ + int ix = as_int(x); + int iy = as_int(y); + int ax = ix & 0x7fffffff; /* |x| bits */ + int ay = iy & 0x7fffffff; /* |y| bits */ + ax |= -(ax > 0x7f800000); /* NaN: force magnitude to -1 (all ones) so the signed compares below pick the other operand */ + ay |= -(ay > 0x7f800000); + return as_float((-(ax > ay) & ix) | + (-(ay > ax) & iy) | + (-(ax == ay) & ((ix & iy) | (ax & 0x00400000)))); /* equal magnitudes: +x wins unless both signs set; the 0x00400000 term preserves a quiet-NaN bit when both inputs are NaN */ +} +
diff --git a/amd-builtins/math32/minmagF.cl b/amd-builtins/math32/minmagF.cl new file mode 100644 index 0000000..73f9d86 --- /dev/null +++ b/amd-builtins/math32/minmagF.cl
@@ -0,0 +1,36 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" /* minmag(x,y): the argument with the lesser magnitude, branch-free via bit masks */ + +__attribute__((overloadable, always_inline)) float +minmag(float x, float y) +{ + int ix = as_int(x); + int iy = as_int(y); + int ax = ix & 0x7fffffff; /* |x| bits */ + int ay = iy & 0x7fffffff; /* |y| bits */ + return as_float((-(ax < ay) & ix) | + (-(ay < ax) & iy) | + (-(ax == ay) & (ix | iy))); /* equal magnitudes: a negative sign on either input wins */ +} +
diff --git a/amd-builtins/math32/modfF.cl b/amd-builtins/math32/modfF.cl new file mode 100644 index 0000000..5369c0b --- /dev/null +++ b/amd-builtins/math32/modfF.cl
@@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" /* modf(x, iptr): splits x into fraction (returned) and integral part (*iptr), both carrying x's sign */ + +__attribute__((overloadable, always_inline)) float +modf(float x, float *iptr) +{ + int ux = as_int(x); + int e = ((ux >> 23) & 0xff) - 127; /* unbiased exponent of x */ + int s = ux & 0x80000000; /* sign bit of x */ + int msk = 0xffffffff << (23 - e); /* mask for the integral-part bits; NOTE(review): shift count leaves [0,31] for e < -8 or e > 23 -- both cases are overridden by the selects below, but the shift itself relies on hardware shift-count masking; verify */ + int i = msk & ux; /* integral part, truncated toward zero */ + int r = as_uint(x - as_float(i)) | s; /* fraction with x's sign attached */ + + r = e < 0 ? ux : r; /* |x| < 1: fraction is x, integral part is +-0 */ + i = e < 0 ? s : i; + + r = e >= 23 ? s : r; /* no fraction bits: fraction is +-0, integral part is x */ + i = e >= 23 ? ux : i; + + r = (ux & 0x7fffffff) > 0x7f800000 ? 
ux : r; /* NaN input: return the NaN as the fraction */ + + *iptr = as_float(i); + return as_float(r); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline)) float +modf(float x, __global float *iptr) +{ + float i; + float f = modf(x, &i); /* delegate to the private-pointer overload, then copy out */ + *iptr = i; + return f; +} + +__attribute__((overloadable, always_inline)) float +modf(float x, __local float *iptr) +{ + float i; + float f = modf(x, &i); /* delegate to the private-pointer overload, then copy out */ + *iptr = i; + return f; +} +#endif +
diff --git a/amd-builtins/math32/nanF.cl b/amd-builtins/math32/nanF.cl new file mode 100644 index 0000000..33cda39 --- /dev/null +++ b/amd-builtins/math32/nanF.cl
@@ -0,0 +1,28 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +__attribute__((overloadable, always_inline)) float +nan(uint nancode) +{ + return as_float((nancode & 0xfffff) | 0x7fc00000); /* quiet NaN: all-ones exponent plus quiet bit; low 20 bits of nancode kept as payload */ +} +
diff --git a/amd-builtins/math32/nextafterF.cl b/amd-builtins/math32/nextafterF.cl new file mode 100644 index 0000000..e2e75af --- /dev/null +++ b/amd-builtins/math32/nextafterF.cl
@@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" /* nextafter(x,y): next representable float after x in the direction of y */ + +__attribute__((overloadable, always_inline)) float +nextafter(float x, float y) +{ + int ix = as_int(x); + int ax = ix & 0x7fffffff; /* |x| bits */ + int mx = 0x80000000 - ix; + mx = ix < 0 ? mx : ix; /* map bit pattern to a monotonically ordered signed integer */ + int iy = as_int(y); + int ay = iy & 0x7fffffff; /* |y| bits */ + int my = 0x80000000 - iy; + my = iy < 0 ? my : iy; /* same monotonic mapping for y */ + int t = mx + (mx < my ? 1 : -1); /* step one ulp toward y in the monotonic domain */ + int r = 0x80000000 - t; + r = t < 0 ? r : t; /* map back to the IEEE bit pattern */ + r = ax > 0x7f800000 ? ix : r; /* x is NaN: return x */ + r = ay > 0x7f800000 ? iy : r; /* y is NaN: return y */ + r = (ax|ay) == 0 | ix == iy ? iy : r; /* x == y (including +-0 pairs): return y */ + return as_float(r); +}
diff --git a/amd-builtins/math32/pdivF.cl b/amd-builtins/math32/pdivF.cl new file mode 100644 index 0000000..83c3d1f --- /dev/null +++ b/amd-builtins/math32/pdivF.cl
@@ -0,0 +1,79 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math32.h" /* precise (IEEE, non-native) float division, scalar plus vector widths */ + +extern __attribute__((pure)) float __hsail_div_f32(float, float); + +__attribute__((always_inline, weak)) float +__precise_fp32_div_f32(float x, float y) +{ + return __hsail_div_f32(x,y); /* precise divide provided by the HSAIL builtin */ +} + + +__attribute__((always_inline, weak)) float2 +__precise_fp32_div_2f32(float2 x, float2 y) +{ + float2 ret; /* vector forms below split into halves and apply the scalar op lane-wise */ + ret.lo = __precise_fp32_div_f32(x.lo, y.lo); + ret.hi = __precise_fp32_div_f32(x.hi, y.hi); + return ret; +} + +__attribute__((always_inline, weak)) float3 +__precise_fp32_div_3f32(float3 x, float3 y) +{ + float3 ret; + ret.xy = __precise_fp32_div_2f32(x.xy, y.xy); + ret.z = __precise_fp32_div_f32(x.z, y.z); + return ret; +} + +__attribute__((always_inline, weak)) float4 +__precise_fp32_div_4f32(float4 x, float4 y) +{ + float4 ret; + ret.lo = __precise_fp32_div_2f32(x.lo, y.lo); + ret.hi = __precise_fp32_div_2f32(x.hi, y.hi); + return ret; +} + +__attribute__((always_inline, weak)) float8 +__precise_fp32_div_8f32(float8 x, float8 y) +{ + float8 ret; + ret.lo = __precise_fp32_div_4f32(x.lo, y.lo); + ret.hi = __precise_fp32_div_4f32(x.hi, y.hi); + return ret; +} + +__attribute__((always_inline, weak)) float16 +__precise_fp32_div_16f32(float16 x, float16 y) +{ + float16 ret; + ret.s0123 = __precise_fp32_div_4f32(x.s0123, y.s0123); + ret.s4567 = __precise_fp32_div_4f32(x.s4567, y.s4567); + ret.s89ab = __precise_fp32_div_4f32(x.s89ab, y.s89ab); + ret.scdef = __precise_fp32_div_4f32(x.scdef, y.scdef); + return ret; +}
diff --git a/amd-builtins/math32/powF.cl b/amd-builtins/math32/powF.cl new file mode 100644 index 0000000..de0710c --- /dev/null +++ b/amd-builtins/math32/powF.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_POW +#include "powF_base.h" +
diff --git a/amd-builtins/math32/powF_base.h b/amd-builtins/math32/powF_base.h new file mode 100644 index 0000000..e15ed5f --- /dev/null +++ b/amd-builtins/math32/powF_base.h
@@ -0,0 +1,308 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math32.h" + +// compute pow using log and exp +// x^y = exp(y * log(x)) +// +// we take care not to lose precision in the intermediate steps +// +// When computing log, calculate it in splits, +// +// r = f * (p_inv_head + p_inv_tail) +// r = rh + rt +// +// calculate log polynomial using r, in end addition, do +// poly = poly + ((rh-r) + rt) +// +// lth = -r +// ltt = ((xexp * log2_t) - poly) + logT +// lt = lth + ltt +// +// lh = (xexp * log2_h) + logH +// l = lh + lt +// +// Calculate final log answer as gh and gt, +// gh = l & higher-half bits +// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh)) +// +// yh = y & higher-half bits +// yt = y - yh +// +// Before entering computation of exp, +// vs = ((yt*gt + yt*gh) + yh*gt) +// v = vs + yh*gh +// vt = ((yh*gh - v) + vs) +// +// In calculation of exp, add vt to r that is used for poly +// At the end of exp, do +// ((((expT * poly) + expT) + expH*poly) + expH) + +__attribute__((overloadable)) float +#if defined(COMPILING_POWR) +powr(float x, float y) +#elif defined(COMPILING_POWN) +pown(float x, int ny) +#elif defined(COMPILING_ROOTN) +rootn(float x, int ny) +#else +pow(float x, float y) +#endif +{ + USE_TABLE(float2, p_log, LOGE_TBL); + USE_TABLE(float2, p_inv, LOG_INV_TBL_EP); + USE_TABLE(float2, p_jby64, EXP_TBL_EP); + +#if defined(COMPILING_POWN) + float y = (float)ny; +#elif defined(COMPILING_ROOTN) + float y = MATH_RECIP((float)ny); /* rootn(x,n) = x^(1/n) */ +#endif + + int ix = as_int(x); + int ax = ix & EXSIGNBIT_SP32; /* |x| bits */ + int xpos = ix == ax; /* sign bit of x clear */ + + int iy = as_int(y); + int ay = iy & EXSIGNBIT_SP32; /* |y| bits */ + int ypos = iy == ay; /* sign bit of y clear */ + + // Extra precise log calculation + // First handle case that x is close to 1 + float r = 1.0f - as_float(ax); /* r = 1 - |x| */ + int near1 = fabs(r) < 0x1.0p-4f; /* |x| within 2^-4 of 1: use the series directly */ + float r2 = r*r; + + // Coefficients are just 1/3, 1/4, 1/5 and 1/6 + float poly = mad(r, + mad(r, + mad(r, + mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), + 0x1.99999ap-3f), + 0x1.000000p-2f), + 0x1.555556p-2f); + + poly *= r2*r; + + float lth_near1 = 
-r2 * 0.5f; + float ltt_near1 = -poly; + float lt_near1 = lth_near1 + ltt_near1; + float lh_near1 = -r; + float l_near1 = lh_near1 + lt_near1; + + // Computations for x not near 1 + int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; /* unbiased exponent of |x| */ + float mf = (float)m; + int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); /* renormalize a subnormal |x| */ + float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); + int c = m == -127; /* subnormal input */ + int ixn = c ? ixs : ax; + float mfn = c ? mfs : mf; + + int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); /* 7-bit log-table index, rounded at mantissa bit 15 */ + + // F - Y + float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + + indx = indx >> 16; + float2 tv = p_inv[indx]; + float rh = f * tv.s0; + float rt = f * tv.s1; + r = rh + rt; + + poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r); + poly += (rh - r) + rt; + + const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 + const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 + tv = p_log[indx]; + float lth = -r; + float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1; + float lt = lth + ltt; + float lh = mad(mfn, LOG2_HEAD, tv.s0); + float l = lh + lt; + + // Select near 1 or not + lth = near1 ? lth_near1 : lth; + ltt = near1 ? ltt_near1 : ltt; + lt = near1 ? lt_near1 : lt; + lh = near1 ? lh_near1 : lh; + l = near1 ? 
l_near1 : l; + + float gh = as_float(as_int(l) & 0xfffff000); /* head/tail split of log(|x|) */ + float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); + + float yh = as_float(iy & 0xfffff000); /* head/tail split of y */ + +#if defined(COMPILING_POWN) + float yt = (float)(ny - (int)yh); +#elif defined(COMPILING_ROOTN) + float fny = (float)ny; + float fnyh = as_float(as_int(fny) & 0xfffff000); + float fnyt = (float)(ny - (int)fnyh); + float yt = MATH_DIVIDE(mad(-fnyt, yh, mad(-fnyh, yh, 1.0f)), fny); +#else + float yt = y - yh; +#endif + + float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt)); + float ylogx = mad(yh, gh, ylogx_s); /* y*log(|x|) with compensation term ylogx_t */ + float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s; + + // Extra precise exp of ylogx + const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 + int n = convert_int(ylogx * R_64_BY_LOG2); + float nf = (float) n; + + int j = n & 0x3f; /* table index within the 64-entry exp table */ + m = n >> 6; /* power-of-two scaling */ + int m2 = m << EXPSHIFTBITS_SP32; + + const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227 + const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388 + r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t; + + // Truncated Taylor series for e^r + poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r); + + tv = p_jby64[j]; + + float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0; + #if !defined(SUBNORMALS_SUPPORTED) + int explg = ((as_uint(expylogx) & EXPBITS_SP32 >> 23) - 127); /* NOTE(review): '>>' binds tighter than '&', so this computes (bits & 0xff) - 127, not the exponent; explg is also never read -- verify before porting */ + m = (23-(m + 149)) == 0 ? 1: m; /* NOTE(review): adjusts only the exact m == -126 case (shift count 0) -- verify intent */ + uint mantissa = ((as_uint(expylogx) & MANTBITS_SP32)|IMPBIT_SP32) >> (23-(m + 149)); /* build the subnormal mantissa by shifting in the implicit bit */ + float sexpylogx = as_float(mantissa); + #else + float sexpylogx = expylogx * as_float(0x1 << (m + 149)); /* scale into the subnormal range */ + #endif + + + float texpylogx = as_float(as_int(expylogx) + m2); /* normal-range scaling by 2^m */ + expylogx = m < -125 ? sexpylogx : texpylogx; + + // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 + expylogx = ylogx > 0x1.62e430p+6f | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f) ? 
as_float(PINFBITPATT_SP32) : expylogx; + + // Result is 0 if ylogx < -149*log2 + expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; + + // Classify y: + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + +#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN) + int inty = 2 - (ny & 1); /* integer exponent given directly: parity decides odd(1)/even(2) */ +#else + int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1; + int mask = (1 << (24 - yexp)) - 1; /* fractional-bit mask of y's mantissa */ + int yodd = ((iy >> (24 - yexp)) & 0x1) != 0; + int inty = yodd ? 1 : 2; + inty = (iy & mask) != 0 ? 0 : inty; /* fractional bits set: not an integer */ + inty = yexp < 1 ? 0 : inty; /* |y| < 1: not an integer (y == 0 handled in corner cases) */ + inty = yexp > 24 ? 2 : inty; /* large |y|: always an even integer */ +#endif + + float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + expylogx = (inty == 1 & !xpos) ? signval : expylogx; /* negative base, odd integer exponent: negate the result */ + int ret = as_int(expylogx); + + // Corner case handling + +#if defined COMPILING_POWR + ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret; + ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret; + ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret; + ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret; + ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ix < PINFBITPATT_SP32 & ay == 0 ? 0x3f800000 : ret; + ret = ax == PINFBITPATT_SP32 & !ypos ? 0 : ret; + ret = ax == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret; + ret = ax == PINFBITPATT_SP32 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ax == PINFBITPATT_SP32 & ay == 0 ? QNANBITPATT_SP32 : ret; + ret = ax == 0 & !ypos ? PINFBITPATT_SP32 : ret; + ret = ax == 0 & ypos ? 0 : ret; + ret = ax == 0 & ay == 0 ? QNANBITPATT_SP32 : ret; + ret = ax != 0 & !xpos ? QNANBITPATT_SP32 : ret; /* powr of a negative base is NaN */ + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ay > PINFBITPATT_SP32 ? iy : ret; +#elif defined COMPILING_POWN + int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; + ret = ax == 0 & !ypos & inty == 1 ? 
xinf : ret; + ret = ax == 0 & !ypos & inty == 2 ? PINFBITPATT_SP32 : ret; + ret = ax == 0 & ypos & inty == 2 ? 0 : ret; + int xzero = !xpos ? 0x80000000 : 0L; + ret = ax == 0 & ypos & inty == 1 ? xzero : ret; + ret = ix == NINFBITPATT_SP32 & !ypos & inty == 1 ? 0x80000000 : ret; + ret = ix == NINFBITPATT_SP32 & !ypos & inty != 1 ? 0 : ret; + ret = ix == NINFBITPATT_SP32 & ypos & inty == 1 ? NINFBITPATT_SP32 : ret; + ret = ix == NINFBITPATT_SP32 & ypos & inty != 1 ? PINFBITPATT_SP32 : ret; + ret = ix == PINFBITPATT_SP32 & !ypos ? 0 : ret; + ret = ix == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret; + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ny == 0 ? 0x3f800000 : ret; /* pown(x, 0) = 1 for any x */ +#elif defined COMPILING_ROOTN + ret = !xpos & inty == 2 ? QNANBITPATT_SP32 : ret; /* even root of a negative number is NaN */ + int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; + ret = ax == 0 & !ypos & inty == 1 ? xinf : ret; + ret = ax == 0 & !ypos & inty == 2 ? PINFBITPATT_SP32 : ret; + ret = ax == 0 & ypos & inty == 2 ? 0 : ret; + int xzero = xpos ? 0 : 0x80000000; + ret = ax == 0 & ypos & inty == 1 ? xzero : ret; + ret = ix == NINFBITPATT_SP32 & ypos & inty == 1 ? NINFBITPATT_SP32 : ret; + ret = ix == NINFBITPATT_SP32 & !ypos & inty == 1 ? 0x80000000 : ret; + ret = ix == PINFBITPATT_SP32 & !ypos ? 0 : ret; + ret = ix == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret; + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ny == 0 ? QNANBITPATT_SP32 : ret; /* rootn(x, 0) is undefined */ +#else + ret = !xpos & inty == 0 ? QNANBITPATT_SP32 : ret; /* negative base with non-integer exponent is NaN */ + ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret; + ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret; + ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; + ret = ax == 0 & !ypos & inty == 1 ? xinf : ret; + ret = ax == 0 & !ypos & inty != 1 ? PINFBITPATT_SP32 : ret; + int xzero = xpos ? 0 : 0x80000000; + ret = ax == 0 & ypos & inty == 1 ? 
xzero : ret; + ret = ax == 0 & ypos & inty != 1 ? 0 : ret; + ret = ax == 0 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ix == 0xbf800000 & ay == PINFBITPATT_SP32 ? 0x3f800000 : ret; /* pow(-1, +-Inf) = 1 */ + ret = ix == NINFBITPATT_SP32 & !ypos & inty == 1 ? 0x80000000 : ret; + ret = ix == NINFBITPATT_SP32 & !ypos & inty != 1 ? 0 : ret; + ret = ix == NINFBITPATT_SP32 & ypos & inty == 1 ? NINFBITPATT_SP32 : ret; + ret = ix == NINFBITPATT_SP32 & ypos & inty != 1 ? PINFBITPATT_SP32 : ret; + ret = ix == PINFBITPATT_SP32 & !ypos ? 0 : ret; + ret = ix == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret; + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ay > PINFBITPATT_SP32 ? iy : ret; + ret = ay == 0 ? 0x3f800000 : ret; /* pow(x, +-0) = 1 for any x, even NaN */ + ret = ix == 0x3f800000 ? 0x3f800000 : ret; /* pow(+1, y) = 1 for any y, even NaN */ +#endif + + return as_float(ret); +} +
diff --git a/amd-builtins/math32/pownF.cl b/amd-builtins/math32/pownF.cl new file mode 100644 index 0000000..f454464 --- /dev/null +++ b/amd-builtins/math32/pownF.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_POWN +#include "powF_base.h" +
diff --git a/amd-builtins/math32/powrF.cl b/amd-builtins/math32/powrF.cl new file mode 100644 index 0000000..7bd5743 --- /dev/null +++ b/amd-builtins/math32/powrF.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_POWR +#include "powF_base.h" +
diff --git a/amd-builtins/math32/psqrtF.cl b/amd-builtins/math32/psqrtF.cl new file mode 100644 index 0000000..0716fbc --- /dev/null +++ b/amd-builtins/math32/psqrtF.cl
@@ -0,0 +1,79 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math32.h" + +// HSAIL flush-to-zero single-precision hardware square root. +extern __attribute__((pure)) float __hsail_sqrt_ftz_f32(float); + +// Scalar precise sqrt simply forwards to the HSAIL intrinsic. +__attribute__((always_inline, weak)) float +__precise_fp32_sqrt_f32(float x) +{ + return __hsail_sqrt_ftz_f32(x); +} + + +// The vector variants below apply the scalar precise sqrt lane by lane, +// recursing on halves of the vector (lo/hi, or xy/z for float3). +__attribute__((always_inline, weak)) float2 +__precise_fp32_sqrt_2f32(float2 x) +{ + float2 ret; + ret.lo = __precise_fp32_sqrt_f32(x.lo); + ret.hi = __precise_fp32_sqrt_f32(x.hi); + return ret; +} + +__attribute__((always_inline, weak)) float3 +__precise_fp32_sqrt_3f32(float3 x) +{ + float3 ret; + ret.xy = __precise_fp32_sqrt_2f32(x.xy); + ret.z = __precise_fp32_sqrt_f32(x.z); + return ret; +} + +__attribute__((always_inline, weak)) float4 +__precise_fp32_sqrt_4f32(float4 x) +{ + float4 ret; + ret.lo = __precise_fp32_sqrt_2f32(x.lo); + ret.hi = __precise_fp32_sqrt_2f32(x.hi); + return ret; +} + +__attribute__((always_inline, weak)) float8 +__precise_fp32_sqrt_8f32(float8 x) +{ + float8 ret; + ret.lo = __precise_fp32_sqrt_4f32(x.lo); + ret.hi = __precise_fp32_sqrt_4f32(x.hi); + return ret; +} + +__attribute__((always_inline, weak)) float16 +__precise_fp32_sqrt_16f32(float16 x) +{ + float16 ret; + ret.s0123 = __precise_fp32_sqrt_4f32(x.s0123); + ret.s4567 = __precise_fp32_sqrt_4f32(x.s4567); + ret.s89ab = __precise_fp32_sqrt_4f32(x.s89ab); + ret.scdef = __precise_fp32_sqrt_4f32(x.scdef); + return ret; +}
diff --git a/amd-builtins/math32/remainderF.cl b/amd-builtins/math32/remainderF.cl new file mode 100644 index 0000000..5339704 --- /dev/null +++ b/amd-builtins/math32/remainderF.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// Select the remainder(float, float) entry point from the shared remainder implementation. +#define COMPILING_REMAINDER +#include "remainderF.h" +
diff --git a/amd-builtins/math32/remainderF.h b/amd-builtins/math32/remainderF.h new file mode 100644 index 0000000..088a10f --- /dev/null +++ b/amd-builtins/math32/remainderF.h
@@ -0,0 +1,451 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +#if !defined(SUBNORMALS_SUPPORTED) +// Scale y by 2**t over the full float range (t is clamped to [-1024, 1024]), +// constructing subnormal results by explicit mantissa shifting, since on this +// path the device flushes subnormals. +static inline float +scaleFullRangef32(float y, float t) +{ + float ay, ty, r = 0; + int k, iiy, iy, exp_iy0, exp_iy, manty, signy, miy; + int delta, shift, ir; + + ay = fabs(t); + k = ay > 1024 ? 1024 : (int) ay; + k = t < 0 ? -k : k; + t = (float) k; + + iiy = as_int(y); + iy = iiy & EXSIGNBIT_SP32; + signy = iiy & SIGNBIT_SP32; + ay = as_float(iy); + + exp_iy0 = iy & EXPBITS_SP32; + manty = iy & MANTBITS_SP32; + + //sub-normal + ty = exp_iy0 == 0 ? (float) manty : as_float(iy); + k = exp_iy0 == 0 ? k - 149 : k; + ay = ty; + iy = as_int(ay); + exp_iy0 = iy & EXPBITS_SP32; + exp_iy = (exp_iy0 >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + // add k to y's exponent + r = as_float(iy + (k << EXPSHIFTBITS_SP32)); + r = (exp_iy + k) > 127 ? 
as_float(PINFBITPATT_SP32) : r; + // add k to y's exponent + delta = -126 - (exp_iy + k); + + // sub-normal + miy = iy & MANTBITS_SP32; + miy |= IMPBIT_SP32; + shift = delta > 23 ? 24 : delta; + shift = delta < 0 ? 0 : shift; + miy >>= shift; + r = delta > 0 ? as_float(miy) : r; + r = t > (float) (2 * EMAX_SP32) ? as_float(PINFBITPATT_SP32) : r; + ir = as_int(r); + r = ir <= PINFBITPATT_SP32 ? as_float(as_int(r) | signy) : r; + return r; +} + +/* Scales the float x by 2.0**n. +Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ +static inline float +scaleFloat_2(float x, int n) +{ + float t1, t2; + int n1, n2; + n1 = n / 2; + n2 = n - n1; + /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ + t1 = as_float((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + t2 = as_float((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + return (x * t1) * t2; +} + +/* Scales the float x by 2.0**n. + Assumes EMIN <= n <= EMAX, though this condition is not checked. */ +static inline float +scaleFloat_1(float x, int n) +{ + float t; + /* Construct the number t = 2.0**n */ + t = as_float((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + return x * t; +} + +/* Computes the exact product of x and y, the result being the +nearly double length number (z,zz) */ +static inline void +mul12f(float x, float y, float *z, float *zz) +{ + float hx, tx, hy, ty; + // Split x into hx (head) and tx (tail). Do the same for y. 
+ uint u; + u = as_uint(x); + u &= 0xfffff000; + hx = as_float(u); + tx = x - hx; + u = as_uint(y); + u &= 0xfffff000; + hy = as_float(u); + ty = y - hy; + *z = x * y; + *zz = (((hx * hy - *z) + hx * ty) + tx * hy) + tx * ty; +} + +#endif //SUBNORMALS_SUPPORTED + +// One shared implementation for fmod, remainder and remquo; the COMPILING_* +// macro selects which of the three signatures is emitted. +#if defined(COMPILING_FMOD) +__attribute__((overloadable)) float +fmod(float x, float y) +#elif defined(COMPILING_REMQUO) +__attribute__((overloadable)) float +remquo(float x, float y, int *quo) +#else +__attribute__((overloadable)) float +remainder(float x, float y) +#endif +{ +#if !defined(SUBNORMALS_SUPPORTED) + + const int loop_scale = 12; + const float fscale = 1.0f / (float) (1 << loop_scale); + + int ntimes; + float ret = 0; + int ui_x, ui_y, ui_ax, ui_ay, xexp, yexp, signx; + float af_x, af_y, af_ybase, fx, fxp, fxm, fy, w, scale, t, c, cc, v; + float yscale, scaled_w, saved_w, div, sdiv, ratio, sratio, fxexp, sub_fx; + int iw_scaled, wexp, it, i, ifx, ex, ey;; + float xr, xr0, xr_base, yr; + uint q; + + ui_x = as_int(x); + ui_y = as_int(y); + ui_ax = ui_x & EXSIGNBIT_SP32; + ui_ay = ui_y & EXSIGNBIT_SP32; + + /* special case handle */ +#if defined(COMPILING_REMQUO) + *quo = 0; +#endif + if (ui_ax > PINFBITPATT_SP32) + return x; + if (ui_ax == PINFBITPATT_SP32) + return as_float(QNANBITPATT_SP32); + if (ui_ay > PINFBITPATT_SP32) + return y; + if (ui_ay == PINFBITPATT_SP32) + return x; + if (ui_ay == 0 && ui_ax == 0) + return as_float(QNANBITPATT_SP32); + if (ui_ax == 0) + return x; + if (ui_ay == 0) + return as_float(QNANBITPATT_SP32); + + signx = ui_x & SIGNBIT_SP32; +#if defined(COMPILING_REMQUO) + int signy = ui_y & SIGNBIT_SP32; +#endif + af_x = as_float(ui_ax); + af_ybase = af_y = as_float(ui_ay); + yexp = (int) ((ui_y & EXPBITS_SP32) >> EXPSHIFTBITS_SP32); + + yscale = (float) ((yexp < 48 && ui_ay != 0) ? 
(48 - yexp) : 0); + if (yscale != 0) { + af_y = scaleFullRangef32(af_ybase, yscale); + } + + ui_y = as_int(af_y); + yexp = (int) ((ui_y & EXPBITS_SP32) >> EXPSHIFTBITS_SP32); + xexp = (int) ((ui_x & EXPBITS_SP32) >> EXPSHIFTBITS_SP32); + fx = af_x; + fy = af_y; + + /* Set ntimes to the number of times we need to do a + partial remainder. If the exponent of x is an exact multiple + of 24 larger than the exponent of y, and the mantissa of x is + less than the mantissa of y, ntimes will be one too large + but it doesn't matter - it just means that we'll go round + the loop below one extra time. */ + ntimes = (xexp - yexp) / loop_scale; + ntimes = xexp <= yexp ? 0 : ntimes; + + /* Set w = y * 2^(ntimes*loop_scale) */ + w = scaleFloat_2(fy, ntimes*loop_scale); + w = ntimes == 0 ? fy : w; + + /* Set scale = 2^(-loop_scale) */ + scale = ntimes == 0 ? 1.0f : fscale; + + // make sure recip does not overflow + wexp = (int) ((as_int(w) & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + saved_w = w; + scaled_w = scaleFloat_1(w, -14); + iw_scaled = wexp > 105 & wexp <= 127; + w = iw_scaled & ntimes > 0 ? scaled_w : w; + + /* Each time round the loop we compute a partial remainder. + This is done by subtracting a large multiple of w + from x each time, where w is a scaled up version of y. + The subtraction can be performed exactly when performed + in double precision, and the result at each stage can + fit exactly in a single precision number. */ + for (i = 0; i < ntimes; i++) { + /* Set fx = fx - w * t, where t is equal to trunc(dx/w). */ + div = __amdil_improved_div_f32(fx, w); + sdiv = scaleFloat_1(div, -14); + div = iw_scaled ? sdiv : div; + t = floor(div); + w = saved_w; + iw_scaled = 0; + + /* At this point, t may be one too large due to rounding of fx/w */ + + /* Compute w * t in quad precision */ + mul12f(w, t, &c, &cc); + + /* Subtract w * t from fx */ + v = fx - c; + fx = v + (((fx - v) - c) - cc); + + /* If t was one too large, fx will be negative. 
Add back one w */ + /* It might be possible to speed up this loop by finding + a way to compute correctly truncated t directly from fx and w. + We would then avoid the need for this check on negative fx. */ + fxp = fx + w; + fxm = fx - w; + fx = fx < 0.0f ? fxp : fx; + fx = fx >= w ? fxm : fx; + + /* Scale w down by for the next iteration */ + w *= scale; + saved_w = w; + } + + /* One more time */ + // iw = as_int(w); + ifx = as_int(fx); + fxexp = (int) ((ifx & EXPBITS_SP32) >> EXPSHIFTBITS_SP32); + // wexp = (int) ((iw & EXPBITS_SP32) >> EXPSHIFTBITS_SP32); + sub_fx = fx; + // make sure recip does not overflow + wexp = (int) ((as_int(w) & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + saved_w = w; + scaled_w = scaleFloat_1(w, -14); + iw_scaled = wexp > 105 & wexp <= 127; + w = iw_scaled ? scaled_w : w; + ratio = __amdil_improved_div_f32(fx, w); + sratio = scaleFloat_1(ratio, -14); + ratio = iw_scaled ? sratio : ratio; + t = floor(ratio); + it = (int) t; + + w = saved_w; + mul12f(w, t, &c, &cc); + + v = fx - c; + fx = v + (((fx - v) - c) - cc); + + if (fx < 0.0f) { + fx += w; + it--; + } + + if (fx >= w) { + fx -= w; + it++; + } + + // sub-normal fax + fx = fxexp == 0 ? sub_fx : fx; + +#if !defined(COMPILING_FMOD) + float scaleback = 0; +#endif + + // in case fx == 0 and we'got a divisor + it = (yscale > 30) ? 0 : ((unsigned int) it << (int) yscale); + + if (as_int(fx) != 0 && yscale != 0) { + xr = fx; + xr_base = fx; + yr = af_ybase; + q = 0; + ex = ilogb(fx); + ey = ilogb(af_ybase); + + yr = (float) scaleFullRangef32(af_ybase, (float) -ey); + xr = (float) scaleFullRangef32(fx, (float) -ex); + + for (i = ex - ey; i > 0; i--) { + q <<= 1; + xr0 = xr; + xr = (xr0 >= yr) ? xr0 - yr : xr0; + q = (xr0 >= yr) ? q + 1 : q; + xr += xr; + } + q <<= 1; + xr0 = xr; + xr = (xr0 >= yr) ? xr0 - yr : xr0; + q = (xr0 >= yr) ? q + 1 : q; + xr = scaleFullRangef32(xr, (float) ey); + + fx = (ex - ey >= 0) ? xr : xr_base; +#if !defined(COMPILING_FMOD) + q = (ex - ey >= 0) ? 
q : 0; + it += q; + + xexp = (int) ((as_int(fx) & EXPBITS_SP32) >> EXPSHIFTBITS_SP32); + + w = af_ybase; + if (xexp < 24) { + fx = scaleFullRangef32(fx, 48); + w = scaleFullRangef32(af_ybase, 48); + scaleback = -48; + } +#endif + } +#if !defined(COMPILING_FMOD) + /* At this point, dx lies in the range [0,dy) */ + /* For the remainder function, we need to adjust dx + so that it lies in the range (-y/2, y/2] by carefully + subtracting w (== fy == y) if necessary. */ + if (fx * 2.f > w || ((fx * 2.f == w) && (it & 1))) { + fx -= w; + it++; + } + if (scaleback != 0) { + fx = scaleFullRangef32(fx, scaleback); + } +#endif + + ret = (signx) ? as_float(as_int(fx) ^ SIGNBIT_SP32) : fx; +#if defined(COMPILING_REMQUO) + it = (signx ^ signy) ? -it : it; + *quo = it; +#endif + + return ret; + + +#else + + x = FTZ(x); + y = FTZ(y); + + int ux = as_int(x); + int ax = ux & EXSIGNBIT_SP32; + float xa = as_float(ax); + int sx = ux ^ ax; + int ex = ax >> EXPSHIFTBITS_SP32; + + int uy = as_int(y); + int ay = uy & EXSIGNBIT_SP32; + float ya = as_float(ay); +#if defined COMPILING_REMQUO + int sy = uy ^ ay; +#endif + int ey = ay >> EXPSHIFTBITS_SP32; + + float xr = as_float(0x3f800000 | (ax & 0x007fffff)); + float yr = as_float(0x3f800000 | (ay & 0x007fffff)); + int c; + int k = ex - ey; + +#if defined COMPILING_FMOD +# define BIT c = xr >= yr; xr -= c ? yr : 0.0f; xr += xr +#else + uint q = 0; +# define BIT c = xr >= yr; q = (q << 1) | c; xr -= c ? yr : 0.0f; xr += xr +#endif + + while (k > 3) { + BIT; + BIT; + BIT; + BIT; + k -= 4; + } + + while (k > 0) { + BIT; + --k; + } + +#if !defined COMPILING_FMOD + c = xr > yr; + q = (q << 1) | c; +#else + c = xr >= yr; +#endif + xr -= c ? yr : 0.0f; + + int lt = ex < ey; + +#if !defined COMPILING_FMOD + q = lt ? 0 : q; +#endif + xr = lt ? xa : xr; + yr = lt ? ya : yr; + +#if !defined COMPILING_FMOD + c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & (q & 0x1) == 0x1); + xr -= c ? 
yr : 0.0f; + q += c; +#endif + + float s = as_float(ey << EXPSHIFTBITS_SP32); + xr *= lt ? 1.0f : s; + +#if defined COMPILING_REMQUO + int qsgn = sx == sy ? 1 : -1; + int quot = (q & 0x7f) * qsgn; +#endif + + c = ax == ay; +#if defined COMPILING_REMQUO + quot = c ? qsgn : quot; +#endif + xr = c ? 0.0f : xr; + + xr = as_float(sx ^ as_int(xr)); + + c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0; +#if defined COMPILING_REMQUO + quot = c ? 0 : quot; +#endif + xr = c ? as_float(QNANBITPATT_SP32) : xr; + +#if defined COMPILING_REMQUO + *quo = quot; +#endif + + return xr; + +#endif +} +
diff --git a/amd-builtins/math32/remainderF_piby2.h b/amd-builtins/math32/remainderF_piby2.h new file mode 100644 index 0000000..881ec0d --- /dev/null +++ b/amd-builtins/math32/remainderF_piby2.h
@@ -0,0 +1,256 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// Exact product *hi + *lo == a * b; bh/bt are a precomputed head/tail split +// of b, used only on the non-FMA path. +static inline void +fullMulS(float *hi, float *lo, float a, float b, float bh, float bt) +{ + if (HAVE_HW_FMA32()) { + float ph = a * b; + *hi = ph; + *lo = fma(a, b, -ph); + } else { + float ah = as_float(as_uint(a) & 0xfffff000U); + float at = a - ah; + float ph = a * b; + float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph)))); + *hi = ph; + *lo = pt; + } +} + +// Subtract the nearest multiple of pi/2 from x using 72 bits of pi/2 carried +// in three parts; returns that multiple. Used for |x| < 2^23 (see argReductionS). +static inline float +removePi2S(float *hi, float *lo, float x) +{ + // 72 bits of pi/2 + const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f; + const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f; + const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f; + + const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f; + const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f; + const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f; + + const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f; + const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f; + const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f; + + const float twobypi = 0x1.45f306p-1f; + + float fnpi2 = trunc(mad(x, twobypi, 0.5f)); + + // subtract n * pi/2 from x + float rhead, rtail; + fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t); + float v = x - rhead; + float rem = v + (((x - v) - rhead) - rtail); + + float rhead2, rtail2; + fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t); + v = rem - rhead2; + rem = v + (((rem - v) - rhead2) - rtail2); + + float rhead3, rtail3; + fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t); + v = rem - rhead3; + + *hi = v + ((rem - v) - rhead3); + *lo = -rtail3; + return fnpi2; +} + +static inline int +argReductionSmallS(float *r, float *rr, float x) +{ + float fnpi2 = removePi2S(r, rr, x); + return (int)fnpi2 & 0x3; +} + +extern uint __amdil_umad_u32(uint, uint, uint); +extern uint __amdil_bitalign_i32(uint, uint, uint); + +// Extract 32 bits from the 64-bit concatenation hi:lo, starting 'shift' bits +// up from the bottom of lo. +static inline uint +bitalign(uint hi, uint lo, uint shift) +{ + if (HAVE_BITALIGN()) + return __amdil_bitalign_i32(hi, lo, shift); + else + 
return (hi << (32 - shift)) | (lo >> shift); +} + + +#define FULL_MUL(A, B, HI, LO) \ + LO = A * B; \ + HI = mul_hi(A, B) + +#define FULL_MAD(A, B, C, HI, LO) \ + LO = __amdil_umad_u32(A, B, C); \ + HI = mul_hi(A, B); \ + HI += LO < C + +// Payne-Hanek style reduction for |x| >= 2^23: multiply x's mantissa by +// 224 bits of 2/pi held in 32-bit limbs, then recover the quadrant and the +// fractional part, and finally scale the fraction by pi/2. +static inline int +argReductionLargeS(float *r, float *rr, float x) +{ + int xe = (int)(as_uint(x) >> 23) - 127; + uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); + + // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB + const uint b6 = 0xA2F9836EU; + const uint b5 = 0x4E441529U; + const uint b4 = 0xFC2757D1U; + const uint b3 = 0xF534DDC0U; + const uint b2 = 0xDB629599U; + const uint b1 = 0x3C439041U; + const uint b0 = 0xFE5163ABU; + + uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; + + FULL_MUL(xm, b0, c0, p0); + FULL_MAD(xm, b1, c0, c1, p1); + FULL_MAD(xm, b2, c1, c0, p2); + FULL_MAD(xm, b3, c0, c1, p3); + FULL_MAD(xm, b4, c1, c0, p4); + FULL_MAD(xm, b5, c0, c1, p5); + FULL_MAD(xm, b6, c1, p7, p6); + + uint fbits = 224 + 23 - xe; + + // shift amount to get 2 lsb of integer part at top 2 bits + // min: 25 (xe=18) max: 134 (xe=127) + uint shift = 256U - 2 - fbits; + + // Shift by up to 134/32 = 4 words + int c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + p2 = c ? p1 : p2; + p1 = c ? p0 : p1; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + p2 = c ? p1 : p2; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? 
p3 : p4; + shift -= (-c) & 32; + + // bitalign cannot handle a shift of 32 + c = shift > 0; + shift = 32 - shift; + uint t7 = bitalign(p7, p6, shift); + uint t6 = bitalign(p6, p5, shift); + uint t5 = bitalign(p5, p4, shift); + p7 = c ? t7 : p7; + p6 = c ? t6 : p6; + p5 = c ? t5 : p5; + + // Get 2 lsb of int part and msb of fraction + int i = p7 >> 29; + + // Scoot up 2 more bits so only fraction remains + p7 = bitalign(p7, p6, 30); + p6 = bitalign(p6, p5, 30); + p5 = bitalign(p5, p4, 30); + + // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 + uint flip = i & 1 ? 0xffffffffU : 0U; + uint sign = i & 1 ? 0x80000000U : 0U; + p7 = p7 ^ flip; + p6 = p6 ^ flip; + p5 = p5 ^ flip; + + // Find exponent and shift away leading zeroes and hidden bit + xe = clz(p7) + 1; + shift = 32 - xe; + p7 = bitalign(p7, p6, shift); + p6 = bitalign(p6, p5, shift); + + // Most significant part of fraction + float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9)); + + // Shift out bits we captured on q1 + p7 = bitalign(p7, p6, 32-23); + + // Get 24 more bits of fraction in another float, there are not long strings of zeroes here + int xxe = clz(p7) + 1; + p7 = bitalign(p7, p6, 32-xxe); + float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9)); + + // At this point, the fraction q1 + q0 is correct to at least 48 bits + // Now we need to multiply the fraction by pi/2 + // This loses us about 4 bits + // pi/2 = C90 FDA A22 168 C23 4C4 + + const float pio2h = (float)0xc90fda / 0x1.0p+23f; + const float pio2hh = (float)0xc90 / 0x1.0p+11f; + const float pio2ht = (float)0xfda / 0x1.0p+23f; + const float pio2t = (float)0xa22168 / 0x1.0p+47f; + + float rh, rt; + + if (HAVE_HW_FMA32()) { + rh = q1 * pio2h; + rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh))); + } else { + float q1h = as_float(as_uint(q1) & 0xfffff000); + float q1t = q1 - q1h; + rh = q1 * pio2h; + rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh)))); + rt = mad(q0, pio2h, 
mad(q1, pio2t, rt)); + } + + float t = rh + rt; + rt = rt - (t - rh); + + *r = t; + *rr = rt; + return ((i >> 1) + (i & 1)) & 0x3; +} + +// Reduce x modulo pi/2 to r + rr and return the quadrant (0..3); dispatches +// on |x| between the small and large reduction paths. +static inline int +argReductionS(float *r, float *rr, float x) +{ + if (x < 0x1.0p+23f) + return argReductionSmallS(r, rr, x); + else + return argReductionLargeS(r, rr, x); +} +
diff --git a/amd-builtins/math32/remquoF.cl b/amd-builtins/math32/remquoF.cl new file mode 100644 index 0000000..4e8b94a --- /dev/null +++ b/amd-builtins/math32/remquoF.cl
@@ -0,0 +1,45 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_REMQUO +#include "remainderF.h" + +// Pre-2.0 OpenCL has no generic address space, so the __global and __local +// pointer overloads are spelled out here; each forwards through a private +// int and copies the quotient back out. +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline)) float +remquo(float x, float y, __global int *quo) +{ + int q; + float r = remquo(x, y, &q); + *quo = q; + return r; +} + +__attribute__((overloadable, always_inline)) float +remquo(float x, float y, __local int *quo) +{ + int q; + float r = remquo(x, y, &q); + *quo = q; + return r; +} +#endif +
diff --git a/amd-builtins/math32/rintF.cl b/amd-builtins/math32/rintF.cl new file mode 100644 index 0000000..caf2663 --- /dev/null +++ b/amd-builtins/math32/rintF.cl
@@ -0,0 +1,31 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +// Compile only scalar definition for HSA + +// rint: round to nearest integer, delegated to the AMDIL round-nearest intrinsic. +__attribute__((overloadable, always_inline)) float +rint(float x) +{ + return __amdil_round_nearest_f32(x); +}
diff --git a/amd-builtins/math32/rootnF.cl b/amd-builtins/math32/rootnF.cl new file mode 100644 index 0000000..2d13aa1 --- /dev/null +++ b/amd-builtins/math32/rootnF.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// Select the rootn(float, int) entry point from the shared powF implementation. +#define COMPILING_ROOTN +#include "powF_base.h" +
diff --git a/amd-builtins/math32/roundF.cl b/amd-builtins/math32/roundF.cl new file mode 100644 index 0000000..548ba6a --- /dev/null +++ b/amd-builtins/math32/roundF.cl
@@ -0,0 +1,33 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +// round: nearest integer, halfway cases rounded away from zero. 'o' is +// +/-1.0f carrying x's sign bit and is added to trunc(x) when the fractional +// part |x - trunc(x)| is at least one half. +__attribute__((overloadable, always_inline)) float +round(float x) +{ + float t = trunc(x); + float d = fabs(x - t); + float o = as_float((as_int(x) & 0x80000000) | 0x3f800000); + return t + (d >= 0.5f ? o : 0.0f); +} +
diff --git a/amd-builtins/math32/rsqrtF.cl b/amd-builtins/math32/rsqrtF.cl new file mode 100644 index 0000000..dc4ab37 --- /dev/null +++ b/amd-builtins/math32/rsqrtF.cl
@@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +// rsqrt(x) = 1/sqrt(x). When subnormals are flushed, subnormal inputs are +// prescaled by 2^26 with bit manipulation (avoiding a multiply that would +// flush), and the result is compensated by 2^13 since +// rsqrt(x * 2^26) == rsqrt(x) * 2^-13. +__attribute__((overloadable, always_inline, weak)) float +rsqrt(float x) +{ +#if !defined(SUBNORMALS_SUPPORTED) + int i = as_int(x); + int ai = i & 0x7fffffff; + int d = ai > 0 & ai < 0x00800000; + // scale subnormal by 2^26 without multiplying to avoid input flush + float s = as_float(i | 0x0d800000) - 0x1.0p-100F; + x = d ? s : x; + x = native_rsqrt(x); + x *= d ? 0x1.0p+13F : 1.0F; + return x; +#else //SUBNORMALS_SUPPORTED + return native_rsqrt(x); +#endif + + +}
diff --git a/amd-builtins/math32/sinF.cl b/amd-builtins/math32/sinF.cl new file mode 100644 index 0000000..8d02067 --- /dev/null +++ b/amd-builtins/math32/sinF.cl
@@ -0,0 +1,55 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#if 1 +#include "math32.h" +#include "remainderF_piby2.h" +#include "sincosF_piby4.h" +#endif + +__attribute__((overloadable, pure)) float +sin(float x) +{ +#if 1 + int ix = as_int(x); + int ax = ix & 0x7fffffff; + float dx = as_float(ax); + + float r0, r1; + int regn = argReductionS(&r0, &r1, dx); + + float ss = sinf_piby4_new(r0, r1); + float cc = cosf_piby4_new(r0, r1); + + float s = (regn & 1) != 0 ? cc : ss; + s = as_float(as_int(s) ^ ((regn > 1) << 31) ^ (ix ^ ax)); + + s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s; + + //Subnormals + s = x == 0. ? x : s; + return s; +#else + // TODO_HSA: Using native_sin for now. + return native_sin(x); +#endif +} +
diff --git a/amd-builtins/math32/sincosF.cl b/amd-builtins/math32/sincosF.cl new file mode 100644 index 0000000..7c98df2 --- /dev/null +++ b/amd-builtins/math32/sincosF.cl
#include "math32.h"
#include "remainderF_piby2.h"
#include "sincosF_piby4.h"

// Simultaneous single-precision sine and cosine: returns sin(x) and
// stores cos(x) through result_cos.  One argument reduction feeds both
// the sin and cos core approximations, with quadrant/sign fixups done
// via integer XOR on the float bit patterns.
__attribute__ ((overloadable, always_inline)) float
sincos(float x, float *result_cos)
{
    int ix = as_int(x);
    int ax = ix & 0x7fffffff;
    float dx = as_float(ax);   // |x|

    // Almost all args should be caught in the first branch
    // of the reduction; |x| = (r0 + r1) + regn * pi/2.
    float r0, r1;
    int regn = argReductionS(&r0, &r1, dx);

    float ss = sinf_piby4_new(r0, r1);
    float cc = cosf_piby4_new(r0, r1);

    // Quadrants 2 and 3 negate the result (sign bit XOR).
    int flip = (regn > 1) << 31;
    // Odd quadrants swap the sin/cos cores.
    float s = (regn & 1) != 0 ? cc : ss;
    // (ax ^ ix) restores the original argument's sign: sin is odd.
    s = as_float(as_int(s) ^ flip ^ (ax ^ ix));
    ss = -ss;                  // cos in odd quadrants is -sin of the core
    float c = (regn & 1) != 0 ? ss : cc;
    c = as_float(as_int(c) ^ flip);   // cos is even: no input-sign fixup

    // +-inf and NaN yield NaN for both results.
    c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c;
    s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s;

    *result_cos = c;
    return s;
}

#if __OPENCL_C_VERSION__ < 200
// Pre-OpenCL-2.0 has no generic address space, so explicit __local and
// __global overloads forward to the private-pointer implementation.
__attribute__ ((overloadable, always_inline)) float
sincos(float x, __local float *result_cos)
{
    float c;
    float s = sincos(x, &c);
    *result_cos = c;
    return s;
}

__attribute__ ((overloadable, always_inline)) float
sincos(float x, __global float *result_cos)
{
    float c;
    float s = sincos(x, &c);
    *result_cos = c;
    return s;
}
#endif
diff --git a/amd-builtins/math32/sincosF_piby4.h b/amd-builtins/math32/sincosF_piby4.h new file mode 100644 index 0000000..f490f0e --- /dev/null +++ b/amd-builtins/math32/sincosF_piby4.h
// Core single-precision sin approximation on [-pi/4, pi/4].
// x is the head and y the (small) tail of the reduced argument produced
// by argument reduction; carrying y through the evaluation preserves
// accuracy for large original arguments.
static inline float sinf_piby4_new(float x, float y)
{
    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
    //               = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
    //               = x * f(w)
    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
    // We use a minimax approximation of (f(w) - 1) / w
    // because this produces an expansion in even powers of x.

    const float c1 = -0.1666666666e0f;
    const float c2 = 0.8333331876e-2f;
    const float c3 = -0.198400874e-3f;
    const float c4 = 0.272500015e-5f;
    const float c5 = -2.5050759689e-08f; // 0xb2d72f34
    const float c6 = 1.5896910177e-10f;  // 0x2f2ec9d3

    float z = x * x;
    float v = z * x;   // x^3
    // Horner evaluation of the even-power tail (c2..c6 in w = z).
    float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2);
    // Combine head, c1 term, polynomial tail, and the y correction in an
    // order chosen to minimize rounding error; do not reassociate.
    float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y));

    return ret;
}

// Core single-precision cos approximation on [-pi/4, pi/4]; x/y are the
// head/tail of the reduced argument, as for sinf_piby4_new.
static inline float cosf_piby4_new(float x, float y)
{
    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
    //               = f(w)
    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
    // because this produces an expansion in even powers of x.

    const float c1 = 0.416666666e-1f;
    const float c2 = -0.138888876e-2f;
    const float c3 = 0.248006008e-4f;
    const float c4 = -0.2730101334e-6f;
    const float c5 = 2.0875723372e-09f;  // 0x310f74f6
    const float c6 = -1.1359647598e-11f; // 0xad47d74e

    float z = x * x;
    float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1);

    // qx splits off a piece of 1 so that the subtraction 1 - x^2/2 does
    // not lose precision; the split depends on the size of |x|.
    // if |x| < 0.3
    float qx = 0.0f;

    int ix = as_int(x) & EXSIGNBIT_SP32;

    // 0.78125 > |x| >= 0.3: qx = |x|/4 (exponent decrement by 2).
    float xby4 = as_float(ix - 0x01000000);
    qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;

    // x > 0.78125
    qx = ix > 0x3f480000 ? 0.28125f : qx;

    // hz = x^2/2 - qx; result = (1 - qx) - (hz - (z*r - x*y)).
    float hz = mad(z, 0.5f, -qx);
    float a = 1.0f - qx;
    float ret = a - (hz - mad(z, r, -x*y));
    return ret;
}
diff --git a/amd-builtins/math32/sincospiF_piby4.h b/amd-builtins/math32/sincospiF_piby4.h new file mode 100644 index 0000000..0bf4f75 --- /dev/null +++ b/amd-builtins/math32/sincospiF_piby4.h
// Evaluate single-precision sin and cos of a value in the interval
// [-pi/4, pi/4].  Returns (float2)(sin(x), cos(x)) — .x/.lo is the sine,
// .y/.hi is the cosine.
static inline float2
sincosf_piby4(float x)
{
    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
    //               = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
    //               = x * f(w)
    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
    // We use a minimax approximation of (f(w) - 1) / w
    // because this produces an expansion in even powers of x.

    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
    //               = f(w)
    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
    // because this produces an expansion in even powers of x.

    const float sc1 = -0.166666666638608441788607926e0F;
    const float sc2 = 0.833333187633086262120839299e-2F;
    const float sc3 = -0.198400874359527693921333720e-3F;
    const float sc4 = 0.272500015145584081596826911e-5F;

    const float cc1 = 0.41666666664325175238031e-1F;
    const float cc2 = -0.13888887673175665567647e-2F;
    const float cc3 = 0.24800600878112441958053e-4F;
    const float cc4 = -0.27301013343179832472841e-6F;

    float x2 = x * x;

    float2 ret;
    // sin: x + x^3 * P(x^2), Horner form via fused mads.
    ret.x = mad(x*x2, mad(x2, mad(x2, mad(x2, sc4, sc3), sc2), sc1), x);
    // cos: (1 - x^2/2) + x^4 * Q(x^2).
    ret.y = mad(x2*x2, mad(x2, mad(x2, mad(x2, cc4, cc3), cc2), cc1), mad(x2, -0.5f, 1.0f));
    return ret;
}
diff --git a/amd-builtins/math32/sinhF.cl b/amd-builtins/math32/sinhF.cl new file mode 100644 index 0000000..dc4c15b --- /dev/null +++ b/amd-builtins/math32/sinhF.cl
#include "math32.h"

// Single-precision hyperbolic sine.
//
// After dealing with special cases the computation is split into regions
// as follows.
//   abs(x) >= max_sinh_arg:
//     sinh(x) = sign(x)*Inf
//   abs(x) >= small_threshold:
//     sinh(x) = sign(x)*exp(abs(x))/2, since exp(-|x|) is negligible there.
//   abs(x) < small_threshold:
//     table-driven addition formula, see below.
__attribute__((overloadable)) float
sinh(float x)
{
    // Tabulated values of sinh(i) and cosh(i) for i = 0,...,36
    // (tv.s0 = sinh(i), tv.s1 = cosh(i)).
    USE_TABLE(float2, p_tbl, SINHCOSH_TBL);

    const float max_sinh_arg = 0x1.65a9fap+6f;     // ~89.4: overflow above this
    const float small_threshold = 0x1.0a2b24p+3f;  // ~8.3: exp(-y) negligible above

    uint ux = as_uint(x);
    uint aux = ux & EXSIGNBIT_SP32;  // bits of |x|
    uint xs = ux ^ aux;              // sign bit of x
    float y = as_float(aux);         // y = |x|

    // We find the integer part y0 of y and the increment dy = y - y0. We then compute
    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
    // where sinh(y0) and cosh(y0) are tabulated above.
    int ind = (int) y;
    // Clamp out-of-range indices (NaN/huge y); those lanes are overwritten
    // by the edge-case selects below.
    ind = (uint)ind > 36U ? 0 : ind;

    float dy = y - ind;
    float dy2 = dy * dy;

    // Taylor series for sinh(dy): dy + dy^3/3! + dy^5/5! + ...
    float sdy = mad(dy2,
                    mad(dy2,
                        mad(dy2,
                            mad(dy2,
                                mad(dy2,
                                    mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f),
                                    0.250521176994133472333666e-7f),
                                0.275573191913636406057211e-5f),
                            0.198412698413242405162014e-3f),
                        0.833333333333329931873097e-2f),
                    0.166666666666666667013899e0f);
    sdy = mad(sdy, dy*dy2, dy);

    // Taylor series for cosh(dy): 1 + dy^2/2! + dy^4/4! + ...
    float cdy = mad(dy2,
                    mad(dy2,
                        mad(dy2,
                            mad(dy2,
                                mad(dy2,
                                    mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f),
                                    0.275573350756016588011357e-6f),
                                0.248015872460622433115785e-4f),
                            0.138888888889814854814536e-2f),
                        0.416666666666660876512776e-1f),
                    0.500000000000000005911074e0f);
    cdy = mad(cdy, dy2, 1.0f);

    // Addition formula, then reattach the sign of x (sinh is odd).
    float2 tv = p_tbl[ind];
    float z = mad(tv.s1, sdy, tv.s0 * cdy);
    z = as_float(xs | as_uint(z));

    // When y is large enough so that the negative exponential is negligible,
    // so sinh(y) is approximated by sign(x)*exp(y)/2:
    // exp(y)/2 = exp(y - ln(2)) computed with a correction term.
    float t = exp(y - 0x1.62e500p-1f);
    float zsmall = mad(0x1.a0210ep-18f, t, t);
    zsmall = as_float(xs | as_uint(zsmall));
    z = y >= small_threshold ? zsmall : z;

    // Corner cases
    float zinf = as_float(PINFBITPATT_SP32 | xs);
    z = y >= max_sinh_arg ? zinf : z;
    // NaN input, or tiny |x| < 2^-14 where sinh(x) ~= x, returns x as-is.
    // NOTE: parses as (aux > PINF) | (aux < 0x38800000U) -- '<'/'>' bind
    // tighter than '|'.
    z = aux > PINFBITPATT_SP32 | aux < 0x38800000U ? x : z;

    return z;
}
diff --git a/amd-builtins/math32/sinhcoshF_table.h b/amd-builtins/math32/sinhcoshF_table.h new file mode 100644 index 0000000..29189c8 --- /dev/null +++ b/amd-builtins/math32/sinhcoshF_table.h
@@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + + +DECLARE_TABLE(float2, SINHCOSH_TBL, 37, + (float2)(0x0.000000p+0f, 0x1.000000p+0f), + (float2)(0x1.2cd9fcp+0f, 0x1.8b0756p+0f), + (float2)(0x1.d03cf6p+1f, 0x1.e18fa0p+1f), + (float2)(0x1.40926ep+3f, 0x1.422a4ap+3f), + (float2)(0x1.b4a380p+4f, 0x1.b4ee86p+4f), + (float2)(0x1.28d016p+6f, 0x1.28d6fcp+6f), + (float2)(0x1.936d22p+7f, 0x1.936e68p+7f), + (float2)(0x1.122876p+9f, 0x1.122894p+9f), + (float2)(0x1.749ea6p+10f, 0x1.749eaap+10f), + (float2)(0x1.fa7158p+11f, 0x1.fa7158p+11f), + (float2)(0x1.5829dcp+13f, 0x1.5829dep+13f), + (float2)(0x1.d3c448p+14f, 0x1.d3c448p+14f), + (float2)(0x1.3de166p+16f, 0x1.3de166p+16f), + (float2)(0x1.b00b5ap+17f, 0x1.b00b5ap+17f), + (float2)(0x1.259ac4p+19f, 0x1.259ac4p+19f), + (float2)(0x1.8f0ccap+20f, 0x1.8f0ccap+20f), + (float2)(0x1.0f2ebep+22f, 0x1.0f2ebep+22f), + (float2)(0x1.709348p+23f, 0x1.709348p+23f), + (float2)(0x1.f4f220p+24f, 0x1.f4f220p+24f), + (float2)(0x1.546d90p+26f, 0x1.546d90p+26f), + (float2)(0x1.ceb088p+27f, 0x1.ceb088p+27f), + (float2)(0x1.3a6e20p+29f, 0x1.3a6e20p+29f), + (float2)(0x1.ab5adcp+30f, 0x1.ab5adcp+30f), + (float2)(0x1.226af4p+32f, 0x1.226af4p+32f), + (float2)(0x1.8ab7fcp+33f, 0x1.8ab7fcp+33f), + (float2)(0x1.0c3d3ap+35f, 0x1.0c3d3ap+35f), + (float2)(0x1.6c9326p+36f, 0x1.6c9326p+36f), + (float2)(0x1.ef8230p+37f, 0x1.ef8230p+37f), + (float2)(0x1.50bba4p+39f, 0x1.50bba4p+39f), + (float2)(0x1.c9aae4p+40f, 0x1.c9aae4p+40f), + (float2)(0x1.370470p+42f, 0x1.370470p+42f), + (float2)(0x1.a6b766p+43f, 0x1.a6b766p+43f), + (float2)(0x1.1f43fcp+45f, 0x1.1f43fcp+45f), + (float2)(0x1.866f34p+46f, 0x1.866f34p+46f), + (float2)(0x1.0953e2p+48f, 0x1.0953e2p+48f), + (float2)(0x1.689e22p+49f, 0x1.689e22p+49f), + (float2)(0x1.ea215ap+50f, 0x1.ea215ap+50f), +) +
diff --git a/amd-builtins/math32/sinpiF.cl b/amd-builtins/math32/sinpiF.cl new file mode 100644 index 0000000..759143d --- /dev/null +++ b/amd-builtins/math32/sinpiF.cl
#include "math32.h"
#include "sincospiF_piby4.h"
#if !defined(SUBNORMALS_SUPPORTED)
#include "floattointconversion.h"
#endif //SUBNORMALS_SUPPORTED


// Single-precision sin(pi * x).  Works on the fractional part r of |x|,
// folding it into [0, 0.25] and dispatching to the core sin or cos
// approximation; signs are handled with integer XOR on the bit pattern.
__attribute__((overloadable)) float
sinpi(float x)
{
    const float pi = 3.1415926535897932F;

    int ix = as_int(x);
    int xsgn = ix & 0x80000000;  // sign bit of x
    ix ^= xsgn;                  // ix = bits of |x|
    float ax = as_float(ix);
    int iax = (int)ax;           // integer part of |x|
    float r = ax - iax;          // fractional part, in [0, 1)
    // sin(pi*(n + r)) = (-1)^n sin(pi*r): odd integer part flips the sign.
    int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0);

    // Initialize with return for +-Inf and NaN
    int ir = 0x7fc00000;

    // 2^23 <= |x| < Inf, the result is always integer
    ir = ix < 0x7f800000 ? xsgn : ir;

    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval

    // Fold r into [0, 0.25]; e selects cos (1) vs sin (0) of the folded
    // argument a.
    // r < 1.0
    float a = 1.0f - r;
    int e = 0;

    // r <= 0.75
    int c = r <= 0.75f;
    a = c ? r - 0.5f : a;
    e = c ? 1 : e;

    // r < 0.5
    c = r < 0.5f;
    a = c ? 0.5f - r : a;

    // 0 < r <= 0.25
    c = r <= 0.25f;
    a = c ? r : a;
    e = c ? 0 : e;

    // t.lo = sin(a*pi), t.hi = cos(a*pi).
    float2 t = sincosf_piby4(a * pi);
    int jr = xodd ^ as_int(e ? t.hi : t.lo);

    ir = ix < 0x4b000000 ? jr : ir;


#if !defined(SUBNORMALS_SUPPORTED)
    // When subnormals flush to zero, tiny |x| reads as ax <= 0 here;
    // sinpi(x) ~= pi*x for such x, computed via software double emulation
    // to avoid the flush.  NOTE(review): relies on subnormal ax comparing
    // <= 0 under FTZ -- confirm for the target device.
    if(ax <= 0.)
    {
        double d = float_uint_to_double(as_uint(x));
        ir = (double_to_float_uint(d*pi));
    }
#endif //SUBNORMALS_SUPPORTED

    return as_float(ir);
}
diff --git a/amd-builtins/math32/sqrtF.cl b/amd-builtins/math32/sqrtF.cl new file mode 100644 index 0000000..db13fe0 --- /dev/null +++ b/amd-builtins/math32/sqrtF.cl
@@ -0,0 +1,41 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +__attribute__((overloadable, always_inline, weak)) float +sqrt(float x) +{ +#if !defined(SUBNORMALS_SUPPORTED) + int i = as_int(x); + int ai = i & 0x7fffffff; + int d = ai > 0 & ai < 0x00800000; + // scale subnormal by 2^26 without multiplying to avoid input flush + float s = as_float(i | 0x0d800000) - 0x1.0p-100F; + x = d ? s : x; + x = MATH_SQRT(x); + x *= d ? 0x1.0p-13F : 1.0F; + return x; +#else //SUBNORMALS_SUPPORTED + return native_sqrt(x); +#endif +}
diff --git a/amd-builtins/math32/tables32.cl b/amd-builtins/math32/tables32.cl new file mode 100644 index 0000000..d453f5f --- /dev/null +++ b/amd-builtins/math32/tables32.cl
@@ -0,0 +1,34 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" + +#include "expF_table.h" + +#include "logF_table.h" + +#include "sinhcoshF_table.h" + +#include "atan2F_table.h" + +#include "cbrtF_table.h" +
diff --git a/amd-builtins/math32/tanF.cl b/amd-builtins/math32/tanF.cl new file mode 100644 index 0000000..16fa44f --- /dev/null +++ b/amd-builtins/math32/tanF.cl
@@ -0,0 +1,45 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math32.h" +#include "remainderF_piby2.h" +#include "tanF_piby4.h" + +__attribute__((overloadable)) float +tan(float x) +{ + int ix = as_int(x); + int ax = ix & 0x7fffffff; + float dx = as_float(ax); + + float r0, r1; + int regn = argReductionS(&r0, &r1, dx); + + float t = tanf_piby4_new(r0 + r1, regn); + t = as_float(as_int(t) ^ (ix ^ ax)); + + t = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : t; + //Take care of subnormals + t = (x == 0.) ? x : t; + return t; +} +
diff --git a/amd-builtins/math32/tanF_piby4.h b/amd-builtins/math32/tanF_piby4.h new file mode 100644 index 0000000..2d64b0f --- /dev/null +++ b/amd-builtins/math32/tanF_piby4.h
// Core single-precision tangent on the reduced interval.
// For even regn returns tan(x); for odd regn returns -1/tan(x) = -cot(x),
// which is tan of the argument shifted by pi/2 -- the caller selects by
// quadrant parity.
static inline float
tanf_piby4_new(float x, int regn)
{
    // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]:
    // tan(x) ~= x + x^3 * a(x^2)/b(x^2).
    float r = x * x;

    float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);

    float b = mad(r,
                  mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
                  1.15588821434688393452299f);

    // __amdil_improved_div_f32: higher-accuracy hardware division intrinsic.
    float t = mad(x*r, __amdil_improved_div_f32(a, b), x);
    float tr = -MATH_RECIP(t);   // -cot for odd quadrants

    return regn & 1 ? tr : t;
}
diff --git a/amd-builtins/math32/tanhF.cl b/amd-builtins/math32/tanhF.cl new file mode 100644 index 0000000..42cde35 --- /dev/null +++ b/amd-builtins/math32/tanhF.cl
#include "math32.h"

// Single-precision hyperbolic tangent.
__attribute__((overloadable)) float
tanh(float x)
{
    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
    // to the following three formulae:
    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
    // 2.  (1 - (2/(exp(2*x) + 1 )))
    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
    // but computationally, some formulae are better on some ranges.

    const float large_threshold = 0x1.0a2b24p+3f;  // ~8.3: tanh == +-1 beyond

    uint ux = as_uint(x);
    uint aux = ux & EXSIGNBIT_SP32;  // bits of |x|
    uint xs = ux ^ aux;              // sign bit of x

    float y = as_float(aux);         // y = |x|
    float y2 = y*y;

    // Rational minimax approximations tanh(y) ~= y + y^3 * a(y^2)/b(y^2),
    // one fit for y < 0.9 and one for 0.9 <= y <= 1.0.
    float a1 = mad(y2,
                   mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
                   -0.28192806108402678e0F);
    float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F);

    float a2 = mad(y2,
                   mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
                   -0.24069858695196524e0F);
    float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F);

    int c = y < 0.9f;
    float a = c ? a1 : a2;
    float b = c ? b1 : b2;
    float zlo = mad(MATH_DIVIDE(a, b), y*y2, y);

    // Formula 2 above, for 1 < y <= large_threshold.
    float p = exp(2.0f * y) + 1.0f;
    float zhi = 1.0F - MATH_DIVIDE(2.0F, p);

    float z = y <= 1.0f ? zlo : zhi;
    z = as_float(xs | as_uint(z));   // tanh is odd: reattach sign of x

    // Edge cases
    // Saturate to +-1 for large |x|.
    float sone = as_float(0x3f800000U | xs);
    z = y > large_threshold ? sone : z;
    // Tiny |x| < 2^-13 (tanh(x) ~= x) and NaN inputs return x unchanged.
    // NOTE: parses as (aux < 0x39000000) | (aux > 0x7f800000) -- the
    // comparisons bind tighter than '|'.
    z = aux < 0x39000000 | aux > 0x7f800000 ? x : z;

    return z;
}
diff --git a/amd-builtins/math32/tanpiF.cl b/amd-builtins/math32/tanpiF.cl new file mode 100644 index 0000000..ff13a19 --- /dev/null +++ b/amd-builtins/math32/tanpiF.cl
#include "math32.h"

#if !defined(SUBNORMALS_SUPPORTED)
#include "floattointconversion.h"
#endif //SUBNORMALS_SUPPORTED

// Core tan approximation on [0, pi/4]; returns (tan(x), -cot(x)) so the
// caller can select by octant.
static inline float2
tanf_piby4(float x)
{
    // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]:
    // tan(x) ~= x + x^3 * a(x^2)/b(x^2).
    float r = x*x;
    float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
    float b = mad(r, mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
                  1.15588821434688393452299f);
    float t = mad(x*r, MATH_DIVIDE(a,b), x);
    return (float2)(t, -MATH_RECIP(t));
}

// Single-precision tan(pi * x).  Works on the fractional part r of |x|,
// folding it into [0, 0.25]; e selects tan vs -cot of the folded argument
// and s carries the sign to apply, both via integer bit manipulation.
__attribute__((overloadable)) float
tanpi(float x)
{
    const float pi = 3.1415926535897932F;

    int ix = as_int(x);
    int xsgn = ix & 0x80000000;        // sign bit of x
    int xnsgn = xsgn ^ 0x80000000;     // opposite sign bit
    ix ^= xsgn;                        // ix = bits of |x|
    float ax = as_float(ix);
    int iax = (int)ax;                 // integer part of |x|
    float r = ax - iax;                // fractional part, in [0, 1)
    // tan has period 1 in units of pi, but the sign bookkeeping below
    // tracks odd/even integer parts for the half-integer pole case.
    int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0);

    // Initialize with return for +-Inf and NaN
    int ir = 0x7fc00000;

    // 2^24 <= |x| < Inf, the result is always even integer
    ir = ix < 0x7f800000 ? xsgn : ir;

    // 2^23 <= |x| < 2^24, the result is always integer
    ir = ix < 0x4b800000 ? xodd : ir;

    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval

    // Fold r into [0, 0.25]; e selects -cot (1) vs tan (0), s the sign.
    // r < 1.0
    float a = 1.0f - r;
    int e = 0;
    int s = xnsgn;

    // r <= 0.75
    int c = r <= 0.75f;
    a = c ? r - 0.5f : a;
    e = c ? 1 : e;
    s = c ? xsgn : s;

    // r < 0.5
    c = r < 0.5f;
    a = c ? 0.5f - r : a;
    s = c ? xnsgn : s;

    // 0 < r <= 0.25
    c = r <= 0.25f;
    a = c ? r : a;
    e = c ? 0 : e;
    s = c ? xsgn : s;

    // t.lo = tan(a*pi), t.hi = -cot(a*pi).
    float2 t = tanf_piby4(a * pi);
    int jr = s ^ as_int(e ? t.hi : t.lo);

    // Exactly half-integer arguments hit the pole: signed infinity.
    jr = r == 0.5f ? xodd | 0x7f800000 : jr;

    ir = ix < 0x4b000000 ? jr : ir;

#if !defined(SUBNORMALS_SUPPORTED)
    // When subnormals flush to zero, tiny |x| reads as ax <= 0 here;
    // tanpi(x) ~= pi*x for such x, computed via software double emulation
    // to avoid the flush.  NOTE(review): relies on subnormal ax comparing
    // <= 0 under FTZ -- confirm for the target device.
    if(ax <= 0.)
    {
        double d = float_uint_to_double(as_uint(x));
        ir = (double_to_float_uint(d*pi));
    }
#endif //SUBNORMALS_SUPPORTED

    return as_float(ir);
}
diff --git a/amd-builtins/math32/tgammaF.cl b/amd-builtins/math32/tgammaF.cl new file mode 100644 index 0000000..7c76f2c --- /dev/null +++ b/amd-builtins/math32/tgammaF.cl
@@ -0,0 +1,44 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// tgamma(x): gamma function for float, built from exp(lgamma(|x|)).
// Negative arguments use the reflection formula
//   gamma(x) = pi / (|x| * gamma(|x|) * sin(pi*x)).
__attribute__((overloadable, always_inline)) float
tgamma(float x)
{
    const float pi = 3.1415926535897932384626433832795f;

    float absx = fabs(x);
    float result = exp(lgamma(absx));

    if (x < 0.0f)
    {
        float sp = sinpi(x);
        float denom = result * absx * sp;
        result = pi / denom;

        // Mirror the reference edge-case selects: a quotient that
        // collapsed to zero is reported as +inf, and sin(pi*x) == 0
        // (x a negative integer, where gamma is undefined) wins as NaN.
        if (result == 0.0f)
            result = as_float(PINFBITPATT_SP32);
        if (sp == 0.0f)
            result = as_float(QNANBITPATT_SP32);
    }

    return result;
}
diff --git a/amd-builtins/math32/truncF.cl b/amd-builtins/math32/truncF.cl new file mode 100644 index 0000000..5559ed1 --- /dev/null +++ b/amd-builtins/math32/truncF.cl
@@ -0,0 +1,29 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math32.h"

// trunc(x): round toward zero.  Maps directly onto the AMDIL
// round-to-zero intrinsic; special values (+-0, +-inf, NaN) follow the
// intrinsic's semantics.
__attribute__((overloadable, always_inline)) float
trunc(float x)
{
    return __amdil_round_zero_f32(x);
}
diff --git a/amd-builtins/math32/vexpandF.cl b/amd-builtins/math32/vexpandF.cl new file mode 100644 index 0000000..9085000 --- /dev/null +++ b/amd-builtins/math32/vexpandF.cl
@@ -0,0 +1,908 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + + +__attribute__((overloadable, always_inline, weak)) float16 +frexp(float16 x, int16 *p) +{ + float16 r; + int16 i; + int8 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +frexp(float16 x, __global int16 *p) +{ + float16 r; + int16 i; + int8 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +frexp(float16 x, __local int16 *p) +{ + float16 r; + int16 i; + int8 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float8 +frexp(float8 x, int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +frexp(float8 x, __global int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +frexp(float8 x, __local int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float4 +frexp(float4 x, int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +frexp(float4 x, __global int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, 
&j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +frexp(float4 x, __local int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float3 +frexp(float3 x, int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = frexp(x.s01, &j); + i.s01 = j; + r.s2 = frexp(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +frexp(float3 x, __global int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = frexp(x.s01, &j); + i.s01 = j; + r.s2 = frexp(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +frexp(float3 x, __local int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = frexp(x.s01, &j); + i.s01 = j; + r.s2 = frexp(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float2 +frexp(float2 x, int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +frexp(float2 x, __global int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +frexp(float2 x, __local int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float16 +lgamma_r(float16 x, int16 *p) +{ + float16 r; + int16 
i; + int8 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +lgamma_r(float16 x, __global int16 *p) +{ + float16 r; + int16 i; + int8 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +lgamma_r(float16 x, __local int16 *p) +{ + float16 r; + int16 i; + int8 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float8 +lgamma_r(float8 x, int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +lgamma_r(float8 x, __global int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +lgamma_r(float8 x, __local int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float4 +lgamma_r(float4 x, int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +lgamma_r(float4 x, __global int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif 
+ +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +lgamma_r(float4 x, __local int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float3 +lgamma_r(float3 x, int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = lgamma_r(x.s01, &j); + i.s01 = j; + r.s2 = lgamma_r(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +lgamma_r(float3 x, __global int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = lgamma_r(x.s01, &j); + i.s01 = j; + r.s2 = lgamma_r(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +lgamma_r(float3 x, __local int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = lgamma_r(x.s01, &j); + i.s01 = j; + r.s2 = lgamma_r(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float2 +lgamma_r(float2 x, int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +lgamma_r(float2 x, __global int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +lgamma_r(float2 x, __local int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float16 +remquo(float16 x, float16 y, int16 *p) +{ + 
float16 r; + int16 i; + int8 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +remquo(float16 x, float16 y, __global int16 *p) +{ + float16 r; + int16 i; + int8 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +remquo(float16 x, float16 y, __local int16 *p) +{ + float16 r; + int16 i; + int8 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float8 +remquo(float8 x, float8 y, int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +remquo(float8 x, float8 y, __global int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +remquo(float8 x, float8 y, __local int8 *p) +{ + float8 r; + int8 i; + int4 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float4 +remquo(float4 x, float4 y, int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +remquo(float4 x, float4 y, __global int4 *p) +{ + float4 r; + int4 i; 
+ int2 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +remquo(float4 x, float4 y, __local int4 *p) +{ + float4 r; + int4 i; + int2 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float3 +remquo(float3 x, float3 y, int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = remquo(x.s01, y.s01, &j); + i.s01 = j; + r.s2 = remquo(x.s2, y.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +remquo(float3 x, float3 y, __global int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = remquo(x.s01, y.s01, &j); + i.s01 = j; + r.s2 = remquo(x.s2, y.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +remquo(float3 x, float3 y, __local int3 *p) +{ + float3 r; + int3 i; + int2 j; + int k; + + r.s01 = remquo(x.s01, y.s01, &j); + i.s01 = j; + r.s2 = remquo(x.s2, y.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float2 +remquo(float2 x, float2 y, int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +remquo(float2 x, float2 y, __global int2 *p) +{ + float2 r; + int2 i; + int j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +remquo(float2 x, float2 y, __local int2 *p) +{ + float2 r; + 
int2 i; + int j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif +
diff --git a/amd-builtins/math32/xvexpandF.cl b/amd-builtins/math32/xvexpandF.cl new file mode 100644 index 0000000..bffad1d --- /dev/null +++ b/amd-builtins/math32/xvexpandF.cl
@@ -0,0 +1,909 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// XXX this file can be removed after clp is implemented + +__attribute__((overloadable, always_inline, weak)) float16 +fract(float16 x, float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +fract(float16 x, __global float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +fract(float16 x, __local float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float8 +fract(float8 x, float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +fract(float8 x, __global float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +fract(float8 x, __local float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float4 +fract(float4 x, float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +fract(float4 x, __global float4 
*p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +fract(float4 x, __local float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float3 +fract(float3 x, float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = fract(x.s01, &a); + t.s01 = a; + r.s2 = fract(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +fract(float3 x, __global float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = fract(x.s01, &a); + t.s01 = a; + r.s2 = fract(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +fract(float3 x, __local float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = fract(x.s01, &a); + t.s01 = a; + r.s2 = fract(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float2 +fract(float2 x, float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +fract(float2 x, __global float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +fract(float2 x, __local float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + 
+ *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float16 +modf(float16 x, float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +modf(float16 x, __global float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +modf(float16 x, __local float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float8 +modf(float8 x, float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +modf(float8 x, __global float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +modf(float8 x, __local float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float4 +modf(float4 x, float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +modf(float4 x, __global float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo 
= modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +modf(float4 x, __local float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float3 +modf(float3 x, float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = modf(x.s01, &a); + t.s01 = a; + r.s2 = modf(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +modf(float3 x, __global float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = modf(x.s01, &a); + t.s01 = a; + r.s2 = modf(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +modf(float3 x, __local float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = modf(x.s01, &a); + t.s01 = a; + r.s2 = modf(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float2 +modf(float2 x, float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +modf(float2 x, __global float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +modf(float2 x, __local float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, 
always_inline, weak)) float16 +sincos(float16 x, float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +sincos(float16 x, __global float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float16 +sincos(float16 x, __local float16 *p) +{ + float16 r; + float16 t; + float8 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float8 +sincos(float8 x, float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +sincos(float8 x, __global float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float8 +sincos(float8 x, __local float8 *p) +{ + float8 r; + float8 t; + float4 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float4 +sincos(float4 x, float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +sincos(float4 x, __global float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = sincos(x.lo, &a); + 
t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float4 +sincos(float4 x, __local float4 *p) +{ + float4 r; + float4 t; + float2 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float3 +sincos(float3 x, float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = sincos(x.s01, &a); + t.s01 = a; + r.s2 = sincos(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +sincos(float3 x, __global float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = sincos(x.s01, &a); + t.s01 = a; + r.s2 = sincos(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float3 +sincos(float3 x, __local float3 *p) +{ + float3 r; + float3 t; + float2 a; + float b; + + r.s01 = sincos(x.s01, &a); + t.s01 = a; + r.s2 = sincos(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) float2 +sincos(float2 x, float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +sincos(float2 x, __global float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) float2 +sincos(float2 x, __local float2 *p) +{ + float2 r; + float2 t; + float a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif +
diff --git a/amd-builtins/math64/acosD.cl b/amd-builtins/math64/acosD.cl new file mode 100644 index 0000000..c652ba8 --- /dev/null +++ b/amd-builtins/math64/acosD.cl
@@ -0,0 +1,102 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

__attribute__((overloadable)) double
acos(double x)
{
    // Computes arccos(x).
    // The argument is first reduced by noting that arccos(x)
    // is invalid for abs(x) > 1. For denormal and small
    // arguments arccos(x) = pi/2 to machine accuracy.
    // Remaining argument ranges are handled as follows.
    // For abs(x) <= 0.5 use
    // arccos(x) = pi/2 - arcsin(x)
    //           = pi/2 - (x + x^3*R(x^2))
    // where R(x^2) is a rational minimax approximation to
    // (arcsin(x) - x)/x^3.
    // For abs(x) > 0.5 exploit the identity:
    // arccos(x) = pi - 2*arcsin(sqrt(1-x)/2)
    // together with the above rational approximation, and
    // reconstruct the terms carefully.

    const double pi = 3.1415926535897933e+00;             /* 0x400921fb54442d18 */
    const double piby2 = 1.5707963267948965580e+00;       /* 0x3ff921fb54442d18 */
    const double piby2_head = 1.5707963267948965580e+00;  /* 0x3ff921fb54442d18 */
    const double piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */

    double y = fabs(x);
    int xneg = as_int2(x).hi < 0;                      // sign from the high word
    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;   // unbiased exponent of |x|

    // abs(x) >= 0.5
    int transform = xexp >= -1;

    // rt = (1 - |x|)/2 feeds the arcsin(sqrt(...)) identity;
    // y2 = x^2 feeds the direct rational approximation.
    double rt = 0.5 * (1.0 - y);
    double y2 = y * y;
    double r = transform ? rt : y2;

    // Use a rational approximation for [0.0, 0.5]
    double un = fma(r,
                    fma(r,
                        fma(r,
                            fma(r,
                                fma(r, 0.0000482901920344786991880522822991,
                                       0.00109242697235074662306043804220),
                                -0.0549989809235685841612020091328),
                            0.275558175256937652532686256258),
                        -0.445017216867635649900123110649),
                    0.227485835556935010735943483075);

    double ud = fma(r,
                    fma(r,
                        fma(r,
                            fma(r, 0.105869422087204370341222318533,
                                   -0.943639137032492685763471240072),
                            2.76568859157270989520376345954),
                        -3.28431505720958658909889444194),
                    1.36491501334161032038194214209);

    double u = r * MATH_DIVIDE(un, ud);

    // Reconstruct acos carefully in transformed region
    double s = sqrt(r);
    // negative x: acos(x) = pi - 2*(s + s*u) with the tail folded in
    double ztn = fma(-2.0, (s + fma(s, u, -piby2_tail)), pi);

    // s1 = s with the low word zeroed; c corrects the truncated square root
    double s1 = as_double(as_ulong(s) & 0xffffffff00000000UL);
    double c = MATH_DIVIDE(fma(-s1, s1, r), s + s1);
    double ztp = 2.0 * (s1 + fma(s, u, c));
    double zt = xneg ? ztn : ztp;
    // untransformed region: pi/2 - (x + x^3*R(x^2)) in head/tail form
    double z = piby2_head - (x - fma(-x, u, piby2_tail));

    z = transform ? zt : z;

    // |x| < 2^-56: acos(x) == pi/2 to machine accuracy
    z = xexp < -56 ? piby2 : z;
/* z = xexp >= 0 ? as_double(QNANBITPATT_DP64) : z; This check for nan is not working */
    // NaN input: return a quiet NaN carrying the input payload
    z = isnan(x) ? as_double((as_ulong(x) | QNANBITPATT_DP64)) : z;
    // Exact endpoints
    z = x == 1.0 ? 0.0 : z;
    z = x == -1.0 ? pi : z;

    return z;
}
diff --git a/amd-builtins/math64/acoshD.cl b/amd-builtins/math64/acoshD.cl new file mode 100644 index 0000000..5c844d4 --- /dev/null +++ b/amd-builtins/math64/acoshD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"
#include "ep_logD.h"

// Double-precision inverse hyperbolic cosine.
// Uses acosh(x) = log(x + sqrt(x^2 - 1)) via the extra-precision ep_log for
// large x, and log1p on a carefully-computed argument for 1 < x < 128.
// Special cases at the bottom: acosh(1) = 0, NaN/+Inf propagate,
// x < 1 (including negative x) yields NaN.
__attribute__((overloadable)) double
acosh(double x)
{
    const double recrteps = 0x1.6a09e667f3bcdp+26; // 1/sqrt(eps) = 9.49062656242515593767e+07
    //log2_lead and log2_tail sum to an extra-precise version of log(2)
    const double log2_lead = 0x1.62e42ep-1;
    const double log2_tail = 0x1.efa39ef35793cp-25;

    // Handle x >= 128 here
    // For x > 1/sqrt(eps), sqrt(x^2-1) == x to working precision, so the
    // argument of the log collapses to x (with an extra factor of 2 folded
    // into the exponent via xlarge below).
    int xlarge = x > recrteps;
    double r = x + sqrt(fma(x, x, -1.0));
    r = xlarge ? x : r;

    int xexp;
    double r1, r2;
    // Extra-precision log: r = 2^xexp * m, result split across (r1, r2).
    ep_log(r, &xexp, &r1, &r2);

    // Add (xexp + xlarge) * log(2) in head/tail form.
    double dxexp = xexp + xlarge;
    r1 = fma(dxexp, log2_lead, r1);
    r2 = fma(dxexp, log2_tail, r2);

    double ret1 = r1 + r2;

    // Handle 1 < x < 128 here
    // We compute the value
    // t = x - 1.0 + sqrt(2.0*(x - 1.0) + (x - 1.0)*(x - 1.0))
    // using simulated quad precision.
    double t = x - 1.0;
    double u1 = t * 2.0;

    // (t,0) * (t,0) -> (v1, v2)
    double v1 = t * t;
    double v2 = fma(t, t, -v1);

    // (u1,0) + (v1,v2) -> (w1,w2)
    r = u1 + v1;
    double s = (((u1 - r) + v1) + v2);
    double w1 = r + s;
    double w2 = (r - w1) + s;

    // sqrt(w1,w2) -> (u1,u2)
    double p1 = sqrt(w1);
    double a1 = p1*p1;
    double a2 = fma(p1, p1, -a1);
    double temp = (((w1 - a1) - a2) + w2);
    double p2 = MATH_DIVIDE(temp * 0.5, p1);
    u1 = p1 + p2;
    double u2 = (p1 - u1) + p2;

    // (u1,u2) + (t,0) -> (r1,r2)
    r = u1 + t;
    s = ((u1 - r) + t) + u2;
    // r1 = r + s;
    // r2 = (r - r1) + s;
    // t = r1 + r2;
    t = r + s;

    // For arguments 1.13 <= x <= 1.5 the log1p function is good enough
    double ret2 = log1p(t);

    ulong ux = as_ulong(x);
    double ret = x >= 128.0 ? ret1 : ret2;

    // NaN or +Inf (any encoding >= +Inf bit pattern): return x unchanged.
    ret = ux >= 0x7FF0000000000000 ? x : ret;
    ret = x == 1.0 ? 0.0 : ret;
    // Negative x (sign bit set) or x < 1: domain error -> NaN.
    ret = (ux & SIGNBIT_DP64) != 0UL | x < 1.0 ? as_double(QNANBITPATT_DP64) : ret;

    return ret;
}
diff --git a/amd-builtins/math64/acospiD.cl b/amd-builtins/math64/acospiD.cl new file mode 100644 index 0000000..e8fc7ea --- /dev/null +++ b/amd-builtins/math64/acospiD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision acos(x)/pi.
// Same reduction scheme as acos (see comment below); the final result is
// divided by pi. Special cases at the bottom: acospi(1) = 0, acospi(-1) = 1,
// |x| > 1 (xexp >= 0, excluding the exact +-1 cases) yields NaN,
// tiny |x| gives 0.5.
__attribute__((overloadable)) double
acospi(double x)
{
    // Computes arccos(x).
    // The argument is first reduced by noting that arccos(x)
    // is invalid for abs(x) > 1. For denormal and small
    // arguments arccos(x) = pi/2 to machine accuracy.
    // Remaining argument ranges are handled as follows.
    // For abs(x) <= 0.5 use
    //   arccos(x) = pi/2 - arcsin(x)
    //             = pi/2 - (x + x^3*R(x^2))
    //   where R(x^2) is a rational minimax approximation to
    //   (arcsin(x) - x)/x^3.
    // For abs(x) > 0.5 exploit the identity:
    //   arccos(x) = pi - 2*arcsin(sqrt(1-x)/2)
    // together with the above rational approximation, and
    // reconstruct the terms carefully.

    const double pi = 0x1.921fb54442d18p+1;
    const double piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */

    double y = fabs(x);
    // Sign of x via the sign bit of the high 32 bits of its encoding.
    int xneg = as_int2(x).hi < 0;
    // Unbiased exponent of |x| from the IEEE-754 high word.
    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;

    // abs(x) >= 0.5
    int transform = xexp >= -1;

    // Transform y into the range [0,0.5)
    double r1 = 0.5 * (1.0 - y);
    double s = sqrt(r1);
    double r = y * y;
    r = transform ? r1 : r;
    y = transform ? s : y;

    // Use a rational approximation for [0.0, 0.5]
    double un = fma(r,
                    fma(r,
                        fma(r,
                            fma(r,
                                fma(r, 0.0000482901920344786991880522822991,
                                    0.00109242697235074662306043804220),
                                -0.0549989809235685841612020091328),
                            0.275558175256937652532686256258),
                        -0.445017216867635649900123110649),
                    0.227485835556935010735943483075);

    double ud = fma(r,
                    fma(r,
                        fma(r,
                            fma(r, 0.105869422087204370341222318533,
                                -0.943639137032492685763471240072),
                            2.76568859157270989520376345954),
                        -3.28431505720958658909889444194),
                    1.36491501334161032038194214209);

    double u = r * MATH_DIVIDE(un, ud);

    // Reconstruct acos carefully in transformed region
    // Negative branch: 1 - (2/pi)*(s + s*u - piby2_tail).
    double res1 = fma(-2.0, MATH_DIVIDE(s + fma(y, u, -piby2_tail), pi), 1.0);
    // s1 = high 32 bits of s; c corrects for the truncated low bits.
    double s1 = as_double(as_ulong(s) & 0xffffffff00000000UL);
    double c = MATH_DIVIDE(fma(-s1, s1, r), s + s1);
    // Positive branch: (2*s1 + 2*c + 2*y*u)/pi.
    double res2 = MATH_DIVIDE(fma(2.0, s1, fma(2.0, c, 2.0 * y * u)), pi);
    res1 = xneg ? res1 : res2;
    // Untransformed region: 0.5 - (x + x*u)/pi.
    res2 = 0.5 - fma(x, u, x) / pi;
    res1 = transform ? res1 : res2;

    const double qnan = as_double(QNANBITPATT_DP64);
    // xexp >= 0 means |x| >= 1: exact +-1 get exact answers, otherwise NaN.
    res2 = x == 1.0 ? 0.0 : qnan;
    res2 = x == -1.0 ? 1.0 : res2;
    res1 = xexp >= 0 ? res2 : res1;
    // Tiny |x|: acospi(x) = 1/2 to machine accuracy.
    res1 = xexp < -56 ? 0.5 : res1;

    return res1;
}
diff --git a/amd-builtins/math64/asinD.cl b/amd-builtins/math64/asinD.cl new file mode 100644 index 0000000..cc81114 --- /dev/null +++ b/amd-builtins/math64/asinD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision arc sine.
// Odd function: computed for |x| and negated at the end when x < 0.
// Special cases at the bottom: tiny |x| returns x, |x| > 1 (xexp >= 0,
// except |x| == 1 exactly) yields NaN, asin(+-1) = +-pi/2.
__attribute__((overloadable)) double
asin(double x)
{
    // Computes arcsin(x).
    // The argument is first reduced by noting that arcsin(x)
    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
    // For denormal and small arguments arcsin(x) = x to machine
    // accuracy. Remaining argument ranges are handled as follows.
    // For abs(x) <= 0.5 use
    //   arcsin(x) = x + x^3*R(x^2)
    //   where R(x^2) is a rational minimax approximation to
    //   (arcsin(x) - x)/x^3.
    // For abs(x) > 0.5 exploit the identity:
    //   arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2)
    // together with the above rational approximation, and
    // reconstruct the terms carefully.

    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */
    const double hpiby2_head = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */
    const double piby2 = 1.5707963267948965e+00;       /* 0x3ff921fb54442d18 */

    double y = fabs(x);
    // Sign of x via the sign bit of the high 32 bits of its encoding.
    int xneg = as_int2(x).hi < 0;
    // Unbiased exponent of |x| from the IEEE-754 high word.
    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;

    // abs(x) >= 0.5
    int transform = xexp >= -1;

    // transform: r = (1-y)/2 (half-angle identity); otherwise r = y^2.
    double rt = 0.5 * (1.0 - y);
    double y2 = y * y;
    double r = transform ? rt : y2;

    // Use a rational approximation for [0.0, 0.5]

    double un = fma(r,
                    fma(r,
                        fma(r,
                            fma(r,
                                fma(r, 0.0000482901920344786991880522822991,
                                    0.00109242697235074662306043804220),
                                -0.0549989809235685841612020091328),
                            0.275558175256937652532686256258),
                        -0.445017216867635649900123110649),
                    0.227485835556935010735943483075);

    double ud = fma(r,
                    fma(r,
                        fma(r,
                            fma(r, 0.105869422087204370341222318533,
                                -0.943639137032492685763471240072),
                            2.76568859157270989520376345954),
                        -3.28431505720958658909889444194),
                    1.36491501334161032038194214209);

    double u = r * MATH_DIVIDE(un, ud);

    // Reconstruct asin carefully in transformed region
    // sh = high 32 bits of s = sqrt(r); c corrects for the truncated low
    // bits so sh + c approximates sqrt(r) to extra precision.
    double s = sqrt(r);
    double sh = as_double(as_ulong(s) & 0xffffffff00000000UL);
    double c = MATH_DIVIDE(fma(-sh, sh, r), s + sh);
    // asin(x) = pi/2 - 2*arcsin(sqrt(r)), assembled from head/tail pieces.
    double p = fma(2.0*s, u, -fma(-2.0, c, piby2_tail));
    double q = fma(-2.0, sh, hpiby2_head);
    double vt = hpiby2_head - (p - q);
    // Untransformed region: asin(x) = x + x*R(x^2).
    double v = fma(y, u, y);
    v = transform ? vt : v;

    // Tiny |x|: asin(x) = x to machine accuracy.
    v = xexp < -28 ? y : v;
    // xexp >= 0 means |x| >= 1: domain error -> NaN (exact 1 handled next).
    v = xexp >= 0 ? as_double(QNANBITPATT_DP64) : v;
    v = y == 1.0 ? piby2 : v;

    return xneg ? -v : v;
}
diff --git a/amd-builtins/math64/asinhD.cl b/amd-builtins/math64/asinhD.cl new file mode 100644 index 0000000..e856419 --- /dev/null +++ b/amd-builtins/math64/asinhD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"
#include "ep_logD.h"

// Rational-approximation coefficients used by asinh below.
// N* are numerator and D* denominator coefficients; each lettered group
// (A..I) covers one |x| sub-range, selected by the absx comparisons in the
// function body (A: <0.25, B: <0.5, C: <0.75, D: <=1, E: <1.5, F: <2,
// G: <4, H: <8, I: >=8).
#define NA0 -0.12845379283524906084997e0
#define NA1 -0.21060688498409799700819e0
#define NA2 -0.10188951822578188309186e0
#define NA3 -0.13891765817243625541799e-1
#define NA4 -0.10324604871728082428024e-3

#define DA0 0.77072275701149440164511e0
#define DA1 0.16104665505597338100747e1
#define DA2 0.11296034614816689554875e1
#define DA3 0.30079351943799465092429e0
#define DA4 0.235224464765951442265117e-1

#define NB0 -0.12186605129448852495563e0
#define NB1 -0.19777978436593069928318e0
#define NB2 -0.94379072395062374824320e-1
#define NB3 -0.12620141363821680162036e-1
#define NB4 -0.903396794842691998748349e-4

#define DB0 0.73119630776696495279434e0
#define DB1 0.15157170446881616648338e1
#define DB2 0.10524909506981282725413e1
#define DB3 0.27663713103600182193817e0
#define DB4 0.21263492900663656707646e-1

#define NC0 -0.81210026327726247622500e-1
#define NC1 -0.12327355080668808750232e0
#define NC2 -0.53704925162784720405664e-1
#define NC3 -0.63106739048128554465450e-2
#define NC4 -0.35326896180771371053534e-4

#define DC0 0.48726015805581794231182e0
#define DC1 0.95890837357081041150936e0
#define DC2 0.62322223426940387752480e0
#define DC3 0.15028684818508081155141e0
#define DC4 0.10302171620320141529445e-1

#define ND0 -0.4638179204422665073e-1
#define ND1 -0.7162729496035415183e-1
#define ND2 -0.3247795155696775148e-1
#define ND3 -0.4225785421291932164e-2
#define ND4 -0.3808984717603160127e-4
#define ND5 0.8023464184964125826e-6

#define DD0 0.2782907534642231184e0
#define DD1 0.5549945896829343308e0
#define DD2 0.3700732511330698879e0
#define DD3 0.9395783438240780722e-1
#define DD4 0.7200057974217143034e-2

#define NE0 -0.121224194072430701e-4
#define NE1 -0.273145455834305218e-3
#define NE2 -0.152866982560895737e-2
#define NE3 -0.292231744584913045e-2
#define NE4 -0.174670900236060220e-2
#define NE5 -0.891754209521081538e-12

#define DE0 0.499426632161317606e-4
#define DE1 0.139591210395547054e-2
#define DE2 0.107665231109108629e-1
#define DE3 0.325809818749873406e-1
#define DE4 0.415222526655158363e-1
#define DE5 0.186315628774716763e-1

#define NF0 -0.195436610112717345e-4
#define NF1 -0.233315515113382977e-3
#define NF2 -0.645380957611087587e-3
#define NF3 -0.478948863920281252e-3
#define NF4 -0.805234112224091742e-12
#define NF5 0.246428598194879283e-13

#define DF0 0.822166621698664729e-4
#define DF1 0.135346265620413852e-2
#define DF2 0.602739242861830658e-2
#define DF3 0.972227795510722956e-2
#define DF4 0.510878800983771167e-2

#define NG0 -0.209689451648100728e-6
#define NG1 -0.219252358028695992e-5
#define NG2 -0.551641756327550939e-5
#define NG3 -0.382300259826830258e-5
#define NG4 -0.421182121910667329e-17
#define NG5 0.492236019998237684e-19

#define DG0 0.889178444424237735e-6
#define DG1 0.131152171690011152e-4
#define DG2 0.537955850185616847e-4
#define DG3 0.814966175170941864e-4
#define DG4 0.407786943832260752e-4

#define NH0 -0.178284193496441400e-6
#define NH1 -0.928734186616614974e-6
#define NH2 -0.923318925566302615e-6
#define NH3 -0.776417026702577552e-19
#define NH4 0.290845644810826014e-21

#define DH0 0.786694697277890964e-6
#define DH1 0.685435665630965488e-5
#define DH2 0.153780175436788329e-4
#define DH3 0.984873520613417917e-5

#define NI0 -0.538003743384069117e-10
#define NI1 -0.273698654196756169e-9
#define NI2 -0.268129826956403568e-9
#define NI3 -0.804163374628432850e-29

#define DI0 0.238083376363471960e-9
#define DI1 0.203579344621125934e-8
#define DI2 0.450836980450693209e-8
#define DI3 0.286005148753497156e-8

// Double-precision inverse hyperbolic sine.
// Odd function: computed for |x| and negated at the end when x < 0.
// |x| <= 1 uses a rational approximation; larger arguments use
// asinh(x) = log(x + sqrt(x^2 + 1)) via the extra-precision ep_log.
// NaN, +-Inf, and |x| < rteps (where asinh(x) = x) return x unchanged.
__attribute__((overloadable)) double
asinh(double x)
{
    const double rteps = 0x1.6a09e667f3bcdp-27;
    const double recrteps = 0x1.6a09e667f3bcdp+26;

    // log2_lead and log2_tail sum to an extra-precise version of log(2)
    const double log2_lead = 0x1.62e42ep-1;
    const double log2_tail = 0x1.efa39ef35793cp-25;

    ulong ux = as_ulong(x);
    // |x| obtained by clearing the sign bit of the encoding.
    ulong ax = ux & ~SIGNBIT_DP64;
    double absx = as_double(ax);

    double t = x * x;
    double pn, tn, pd, td;

    // XXX we are betting here that we can evaluate 8 pairs of
    // polys faster than we can grab 12 coefficients from a table
    // This also uses fewer registers

    // |x| >= 8
    pn = fma(t, fma(t, fma(t, NI3, NI2), NI1), NI0);
    pd = fma(t, fma(t, fma(t, DI3, DI2), DI1), DI0);

    tn = fma(t, fma(t, fma(t, fma(t, NH4, NH3), NH2), NH1), NH0);
    td = fma(t, fma(t, fma(t, DH3, DH2), DH1), DH0);
    pn = absx < 8.0 ? tn : pn;
    pd = absx < 8.0 ? td : pd;

    tn = fma(t, fma(t, fma(t, fma(t, fma(t, NG5, NG4), NG3), NG2), NG1), NG0);
    td = fma(t, fma(t, fma(t, fma(t, DG4, DG3), DG2), DG1), DG0);
    pn = absx < 4.0 ? tn : pn;
    pd = absx < 4.0 ? td : pd;

    tn = fma(t, fma(t, fma(t, fma(t, fma(t, NF5, NF4), NF3), NF2), NF1), NF0);
    td = fma(t, fma(t, fma(t, fma(t, DF4, DF3), DF2), DF1), DF0);
    pn = absx < 2.0 ? tn : pn;
    pd = absx < 2.0 ? td : pd;

    tn = fma(t, fma(t, fma(t, fma(t, fma(t, NE5, NE4), NE3), NE2), NE1), NE0);
    td = fma(t, fma(t, fma(t, fma(t, fma(t, DE5, DE4), DE3), DE2), DE1), DE0);
    pn = absx < 1.5 ? tn : pn;
    pd = absx < 1.5 ? td : pd;

    tn = fma(t, fma(t, fma(t, fma(t, fma(t, ND5, ND4), ND3), ND2), ND1), ND0);
    td = fma(t, fma(t, fma(t, fma(t, DD4, DD3), DD2), DD1), DD0);
    pn = absx <= 1.0 ? tn : pn;
    pd = absx <= 1.0 ? td : pd;

    tn = fma(t, fma(t, fma(t, fma(t, NC4, NC3), NC2), NC1), NC0);
    td = fma(t, fma(t, fma(t, fma(t, DC4, DC3), DC2), DC1), DC0);
    pn = absx < 0.75 ? tn : pn;
    pd = absx < 0.75 ? td : pd;

    tn = fma(t, fma(t, fma(t, fma(t, NB4, NB3), NB2), NB1), NB0);
    td = fma(t, fma(t, fma(t, fma(t, DB4, DB3), DB2), DB1), DB0);
    pn = absx < 0.5 ? tn : pn;
    pd = absx < 0.5 ? td : pd;

    tn = fma(t, fma(t, fma(t, fma(t, NA4, NA3), NA2), NA1), NA0);
    td = fma(t, fma(t, fma(t, fma(t, DA4, DA3), DA2), DA1), DA0);
    pn = absx < 0.25 ? tn : pn;
    pd = absx < 0.25 ? td : pd;

    double pq = MATH_DIVIDE(pn, pd);

    // |x| <= 1
    double result1 = fma(absx*t, pq, absx);

    // Other ranges
    // xout: skip the sqrt when |x| <= 32 (handled by result2 path) or when
    // |x| > 1/sqrt(eps) (sqrt(x^2+1) == |x| to working precision).
    int xout = absx <= 32.0 | absx > recrteps;
    double y = absx + sqrt(fma(absx, absx, 1.0));
    y = xout ? absx : y;

    double r1, r2;
    int xexp;
    // Extra-precision log: y = 2^xexp * m, result split across (r1, r2).
    ep_log(y, &xexp, &r1, &r2);

    // Add (xexp + xout) * log(2) in head/tail form.
    double dxexp = (double)(xexp + xout);
    r1 = fma(dxexp, log2_lead, r1);
    r2 = fma(dxexp, log2_tail, r2);

    // 1 < x <= 32
    double v2 = (pq + 0.25) / t;
    double r = v2 + r1;
    double s = ((r1 - r) + v2) + r2;
    double v1 = r + s;
    v2 = (r - v1) + s;
    double result2 = v1 + v2;

    // x > 32
    double result3 = r1 + r2;

    double ret = absx > 1.0 ? result2 : result1;
    ret = absx > 32.0 ? result3 : ret;
    ret = x < 0.0 ? -ret : ret;

    // NaN, +-Inf, or x small enough that asinh(x) = x
    ret = ax >= PINFBITPATT_DP64 | absx < rteps ? x : ret;
    return ret;
}
diff --git a/amd-builtins/math64/asinpiD.cl b/amd-builtins/math64/asinpiD.cl new file mode 100644 index 0000000..70bf22c --- /dev/null +++ b/amd-builtins/math64/asinpiD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision asin(x)/pi.
// Same reduction scheme as asin (see comment below); the reconstructed
// value is divided by pi before the special-case ladder. Special cases:
// |x| > 1 (xexp >= 0, except exact +-1) yields NaN, asinpi(+-1) = +-0.5,
// and the result is negated for negative x.
__attribute__((overloadable)) double
asinpi(double x)
{
    // Computes arcsin(x).
    // The argument is first reduced by noting that arcsin(x)
    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
    // For denormal and small arguments arcsin(x) = x to machine
    // accuracy. Remaining argument ranges are handled as follows.
    // For abs(x) <= 0.5 use
    //   arcsin(x) = x + x^3*R(x^2)
    //   where R(x^2) is a rational minimax approximation to
    //   (arcsin(x) - x)/x^3.
    // For abs(x) > 0.5 exploit the identity:
    //   arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2)
    // together with the above rational approximation, and
    // reconstruct the terms carefully.

    const double pi = 0x1.921fb54442d18p+1;
    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */
    const double hpiby2_head = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */

    double y = fabs(x);
    // Sign of x via the sign bit of the high 32 bits of its encoding.
    int xneg = as_int2(x).hi < 0;
    // Unbiased exponent of |x| from the IEEE-754 high word.
    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;

    // abs(x) >= 0.5
    int transform = xexp >= -1;

    // transform: r = (1-y)/2 (half-angle identity); otherwise r = y^2.
    double rt = 0.5 * (1.0 - y);
    double y2 = y * y;
    double r = transform ? rt : y2;

    // Use a rational approximation for [0.0, 0.5]
    double un = fma(r,
                    fma(r,
                        fma(r,
                            fma(r,
                                fma(r, 0.0000482901920344786991880522822991,
                                    0.00109242697235074662306043804220),
                                -0.0549989809235685841612020091328),
                            0.275558175256937652532686256258),
                        -0.445017216867635649900123110649),
                    0.227485835556935010735943483075);

    double ud = fma(r,
                    fma(r,
                        fma(r,
                            fma(r, 0.105869422087204370341222318533,
                                -0.943639137032492685763471240072),
                            2.76568859157270989520376345954),
                        -3.28431505720958658909889444194),
                    1.36491501334161032038194214209);

    double u = r * MATH_DIVIDE(un, ud);


    // Reconstruct asin carefully in transformed region
    // sh = high 32 bits of s = sqrt(r); c corrects for the truncated low bits.
    double s = sqrt(r);
    double sh = as_double(as_ulong(s) & 0xffffffff00000000UL);
    double c = MATH_DIVIDE(fma(-sh, sh, r), s + sh);
    // asin(x) = pi/2 - 2*arcsin(sqrt(r)), assembled from head/tail pieces.
    double p = fma(2.0*s, u, -fma(-2.0, c, piby2_tail));
    double q = fma(-2.0, sh, hpiby2_head);
    double vt = hpiby2_head - (p - q);
    // Untransformed region: asin(x) = x + x*R(x^2).
    double v = fma(y, u, y);
    v = transform ? vt : v;

    // Tiny |x|: asin(x) = x to machine accuracy (divided by pi below).
    v = xexp < -28 ? y : v;
    v = MATH_DIVIDE(v, pi);
    // xexp >= 0 means |x| >= 1: domain error -> NaN (exact 1 handled next).
    v = xexp >= 0 ? as_double(QNANBITPATT_DP64) : v;
    v = y == 1.0 ? 0.5 : v;
    return xneg ? -v : v;
}
diff --git a/amd-builtins/math64/atan2D.cl b/amd-builtins/math64/atan2D.cl new file mode 100644 index 0000000..5359780 --- /dev/null +++ b/amd-builtins/math64/atan2D.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision atan2(y, x): quadrant-correct arc tangent of y/x.
// Computes atan(v/u) for 0 < v < u via a 241-entry atan(j/256) table plus a
// short polynomial correction, then fixes up the quadrant using head/tail
// representations of pi and pi/2. Special cases (infinities, zeros, NaNs,
// and extreme exponent differences) are resolved by the selection ladder at
// the end of the function.
__attribute__((overloadable, always_inline, weak)) double
atan2(double y, double x)
{
    USE_TABLE(double2, atan_jby256_tbl, ATAN_JBY256_TBL);

    // pi, pi/2 and related constants; pi and pi/2 also appear split into
    // head/tail pairs for extra-precision quadrant reconstruction.
    const double pi = 3.1415926535897932e+00;          /* 0x400921fb54442d18 */
    const double piby2 = 1.5707963267948966e+00;       /* 0x3ff921fb54442d18 */
    const double piby4 = 7.8539816339744831e-01;       /* 0x3fe921fb54442d18 */
    const double three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */
    const double pi_head = 3.1415926218032836e+00;     /* 0x400921fb50000000 */
    const double pi_tail = 3.1786509547056392e-08;     /* 0x3e6110b4611a6263 */
    const double piby2_head = 1.5707963267948965e+00;  /* 0x3ff921fb54442d18 */
    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */

    // x2/y2 preserve the original arguments for the special-case ladder;
    // x/y themselves may be rescaled below.
    double x2 = x;
    int xneg = as_int2(x).hi < 0;
    // Biased exponent field from the IEEE-754 high word.
    int xexp = (as_int2(x).hi >> 20) & 0x7ff;

    double y2 = y;
    int yneg = as_int2(y).hi < 0;
    int yexp = (as_int2(y).hi >> 20) & 0x7ff;

    int cond2 = (xexp < 1021) & (yexp < 1021);
    int diffexp = yexp - xexp;

    // Scale up both x and y if they are both below 1/4
    double x1 = ldexp(x, 1024);
    int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff;
    double y1 = ldexp(y, 1024);
    int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff;
    int diffexp1 = yexp1 - xexp1;

    diffexp = cond2 ? diffexp1 : diffexp;
    x = cond2 ? x1 : x;
    y = cond2 ? y1 : y;

    // General case: take absolute values of arguments
    double u = fabs(x);
    double v = fabs(y);

    // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
    int swap_vu = u < v;
    double uu = u;
    u = swap_vu ? v : u;
    v = swap_vu ? uu : v;

    double vbyu = v / u;
    // (q1, q2) accumulate the head/tail parts of atan(v/u).
    double q1, q2;

    // General values of v/u. Use a look-up table and series expansion.

    {
        // Clamp tiny ratios to 0.063 so index stays in table range; the
        // vbyu <= 0.0625 результат is overridden by the q3/q4 path below.
        double val = vbyu > 0.0625 ? vbyu : 0.063;
        int index = convert_int(fma(256.0, val, 0.5));
        // Table entry: (head, tail) of atan(index/256).
        double2 tv = atan_jby256_tbl[index - 16];
        q1 = tv.s0;
        q2 = tv.s1;
        double c = (double)index * 0x1.0p-8;

        // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
        // u_exponent could be EMAX so we have to do it in 2 steps
        int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
        //double um = __amdil_ldexp_f64(u, m);
        //double vm = __amdil_ldexp_f64(v, m);
        double um = ldexp(u, m);
        double vm = ldexp(v, m);

        // 26 leading bits of u
        double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL);
        double u2 = um - u1;

        // r = (v/u - c) / (1 + c*(v/u)), the tangent of the residual angle.
        double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um));

        // Polynomial approximation to atan(r)
        double s = r * r;
        q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r);
    }


    // (q3, q4): result for extremely small v/u, where atan(v/u) = v/u.
    double q3, q4;
    {
        q3 = 0.0;
        q4 = vbyu;
    }

    // (q5, q6): series expansion for small-but-not-tiny v/u (<= 0.0625).
    double q5, q6;
    {
        double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL);
        double u2 = u - u1;
        double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL);
        double vu2 = vbyu - vu1;

        q5 = 0.0;
        double s = vbyu * vbyu;
        q6 = vbyu + fma(-vbyu * s,
                        fma(-s,
                            fma(-s,
                                fma(-s,
                                    fma(-s, 0.90029810285449784439E-01,
                                        0.11110736283514525407),
                                    0.14285713561807169030),
                                0.19999999999393223405),
                            0.33333333333333170500),
                        MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u));
    }


    // Select among the three small-ratio computations.
    q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
    q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;

    q1 = vbyu > 0.0625 ? q1 : q3;
    q2 = vbyu > 0.0625 ? q2 : q4;

    // Tidy-up according to which quadrant the arguments lie in
    double res1, res2, res3, res4;
    // If u and v were swapped, the angle is pi/2 minus the computed one.
    q1 = swap_vu ? piby2_head - q1 : q1;
    q2 = swap_vu ? piby2_tail - q2 : q2;
    // Negative x reflects the angle about pi/2: result = pi - angle.
    q1 = xneg ? pi_head - q1 : q1;
    q2 = xneg ? pi_tail - q2 : q2;
    q1 = q1 + q2;
    res4 = yneg ? -q1 : q1;

    // Both arguments infinite: +-pi/4 or +-3pi/4 depending on signs.
    res1 = yneg ? -three_piby4 : three_piby4;
    res2 = yneg ? -piby4 : piby4;
    res3 = xneg ? res1 : res2;

    res3 = isinf(x2) & isinf(y2) ? res3 : res4;
    res1 = yneg ? -pi : pi;

    // abs(x)/abs(y) > 2^56 and x < 0
    res3 = (diffexp < -56 && xneg) ? res1 : res3;

    res4 = MATH_DIVIDE(y, x);
    // x positive and dominant over y by a factor of 2^28
    res3 = diffexp < -28 & xneg == 0 ? res4 : res3;

    // abs(y)/abs(x) > 2^56
    res4 = yneg ? -piby2 : piby2; // atan(y/x) is insignificant compared to piby2
    res3 = diffexp > 56 ? res4 : res3;

    res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y
    res4 = xneg ? res1 : y2;

    res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-pi for negative x
    res3 = isnan(y2) ? y2 : res3;
    res3 = isnan(x2) ? x2 : res3;

    return res3;
}
diff --git a/amd-builtins/math64/atan2D_table.h b/amd-builtins/math64/atan2D_table.h new file mode 100644 index 0000000..c37177c --- /dev/null +++ b/amd-builtins/math64/atan2D_table.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Table of precomputed values of atan(j/256), for j = 16, 17, ..., 256
+// (241 entries; index with [j - 16]).
+// Each double2 entry packs the value as a head/tail pair:
+//   .s0 holds the leading part (first 21 bits of precision),
+//   .s1 holds the trailing part (a further 53 bits of precision).
+// Consumers (e.g. atan2/atan2pi) read .s0 and .s1 separately so the
+// two parts can be combined with quadrant corrections at full accuracy.
+
+DECLARE_TABLE(double2, ATAN_JBY256_TBL, 241,
+    (double2)(0x1.ff55b00000000p-5, 0x1.6e59fbd38db2cp-26),
+    (double2)(0x1.0f99e00000000p-4, 0x1.4e3aa54dedf96p-25),
+    (double2)(0x1.1f86d00000000p-4, 0x1.7e105ab1bda88p-25),
+    (double2)(0x1.2f71900000000p-4, 0x1.8c5254d013fd0p-27),
+    (double2)(0x1.3f59f00000000p-4, 0x1.cf8ab3ad62670p-29),
+    (double2)(0x1.4f3fd00000000p-4, 0x1.9dca4bec80468p-26),
+    (double2)(0x1.5f23200000000p-4, 0x1.3f4b5ec98a8dap-26),
+    (double2)(0x1.6f03b00000000p-4, 0x1.b9d49619d81fep-25),
+    (double2)(0x1.7ee1800000000p-4, 0x1.3017887460934p-27),
+    (double2)(0x1.8ebc500000000p-4, 0x1.11e3eca0b9944p-26),
+    (double2)(0x1.9e94100000000p-4, 0x1.4f3f73c5a332ep-26),
+    (double2)(0x1.ae68a00000000p-4, 0x1.c71c8ae0e00a6p-26),
+    (double2)(0x1.be39e00000000p-4, 0x1.7cde0f86fbdc7p-25),
+    (double2)(0x1.ce07c00000000p-4, 0x1.70f328c889c72p-26),
+    (double2)(0x1.ddd2100000000p-4, 0x1.c07ae9b994efep-26),
+    (double2)(0x1.ed98c00000000p-4, 0x1.0c8021d7b1698p-27),
+    (double2)(0x1.fd5ba00000000p-4, 0x1.35585edb8cb22p-25),
+    (double2)(0x1.068d500000000p-3, 0x1.0842567b30e96p-24),
+    (double2)(0x1.0e6ad00000000p-3, 0x1.99e811031472ep-24),
+    (double2)(0x1.1646500000000p-3, 0x1.041821416bceep-25),
+    (double2)(0x1.1e1fa00000000p-3, 0x1.f6086e4dc96f4p-24),
+    (double2)(0x1.25f6e00000000p-3, 0x1.71a535c5f1b58p-27),
+    (double2)(0x1.2dcbd00000000p-3, 0x1.65f743fe63ca1p-24),
+    (double2)(0x1.359e800000000p-3, 0x1.dbd733472d014p-24),
+    (double2)(0x1.3d6ee00000000p-3, 0x1.d18cc4d8b0d1dp-24),
+    (double2)(0x1.453ce00000000p-3, 0x1.8c12553c8fb29p-24),
+    (double2)(0x1.4d08700000000p-3, 0x1.53b49e2e8f991p-24),
+    (double2)(0x1.54d1800000000p-3, 0x1.7422ae148c141p-24),
+    (double2)(0x1.5c98100000000p-3, 0x1.e3ec269df56a8p-27),
+    (double2)(0x1.645bf00000000p-3, 0x1.ff6754e7e0ac9p-24),
+    (double2)(0x1.6c1d400000000p-3, 0x1.131267b1b5aadp-24),
+    (double2)(0x1.73dbd00000000p-3, 0x1.d14fa403a94bcp-24),
+    (double2)(0x1.7b97b00000000p-3, 0x1.2f396c089a3d8p-25),
+    (double2)(0x1.8350b00000000p-3, 0x1.c731d78fa95bbp-24),
+    (double2)(0x1.8b06e00000000p-3, 0x1.c50f385177399p-24),
+    (double2)(0x1.92ba300000000p-3, 0x1.f41409c6f2c20p-25),
+    (double2)(0x1.9a6a800000000p-3, 0x1.d2d90c4c39ec0p-24),
+    (double2)(0x1.a217e00000000p-3, 0x1.80420696f2106p-25),
+    (double2)(0x1.a9c2300000000p-3, 0x1.b40327943a2e8p-27),
+    (double2)(0x1.b169600000000p-3, 0x1.5d35e02f3d2a2p-25),
+    (double2)(0x1.b90d700000000p-3, 0x1.4a498288117b0p-25),
+    (double2)(0x1.c0ae500000000p-3, 0x1.35da119afb324p-25),
+    (double2)(0x1.c84bf00000000p-3, 0x1.14e85cdb9a908p-24),
+    (double2)(0x1.cfe6500000000p-3, 0x1.38754e5547b9ap-25),
+    (double2)(0x1.d77d500000000p-3, 0x1.be40ae6ce3246p-24),
+    (double2)(0x1.df11000000000p-3, 0x1.0c993b3bea7e7p-24),
+    (double2)(0x1.e6a1400000000p-3, 0x1.1d2dd89ac3359p-24),
+    (double2)(0x1.ee2e100000000p-3, 0x1.1476603332c46p-25),
+    (double2)(0x1.f5b7500000000p-3, 0x1.f25901bac55b7p-24),
+    (double2)(0x1.fd3d100000000p-3, 0x1.f881b7c826e28p-24),
+    (double2)(0x1.025fa00000000p-2, 0x1.441996d698d20p-24),
+    (double2)(0x1.061ee00000000p-2, 0x1.407ac521ea089p-23),
+    (double2)(0x1.09dc500000000p-2, 0x1.2fb0c6c4b1723p-23),
+    (double2)(0x1.0d97e00000000p-2, 0x1.ca135966a3e18p-23),
+    (double2)(0x1.1151a00000000p-2, 0x1.b1218e4d646e4p-25),
+    (double2)(0x1.1509700000000p-2, 0x1.d4e72a350d288p-25),
+    (double2)(0x1.18bf500000000p-2, 0x1.4617e2f04c329p-23),
+    (double2)(0x1.1c73500000000p-2, 0x1.096ec41e82650p-25),
+    (double2)(0x1.2025500000000p-2, 0x1.9f91f25773e6ep-24),
+    (double2)(0x1.23d5600000000p-2, 0x1.59c0820f1d674p-25),
+    (double2)(0x1.2783700000000p-2, 0x1.02bf7a2df1064p-25),
+    (double2)(0x1.2b2f700000000p-2, 0x1.fb36bfc40508fp-23),
+    (double2)(0x1.2ed9800000000p-2, 0x1.ea08f3f8dc892p-24),
+    (double2)(0x1.3281800000000p-2, 0x1.3ed6254656a0ep-24),
+    (double2)(0x1.3627700000000p-2, 0x1.b83f5e5e69c58p-25),
+    (double2)(0x1.39cb400000000p-2, 0x1.d6ec2af768592p-23),
+    (double2)(0x1.3d6d100000000p-2, 0x1.493889a226f94p-25),
+    (double2)(0x1.410cb00000000p-2, 0x1.5ad8fa65279bap-23),
+    (double2)(0x1.44aa400000000p-2, 0x1.b615784d45434p-25),
+    (double2)(0x1.4845a00000000p-2, 0x1.09a184368f145p-23),
+    (double2)(0x1.4bdee00000000p-2, 0x1.61a2439b0d91cp-24),
+    (double2)(0x1.4f75f00000000p-2, 0x1.ce1a65e39a978p-24),
+    (double2)(0x1.530ad00000000p-2, 0x1.32a39a93b6a66p-23),
+    (double2)(0x1.569d800000000p-2, 0x1.1c3699af804e7p-23),
+    (double2)(0x1.5a2e000000000p-2, 0x1.75e0f4e44ede8p-26),
+    (double2)(0x1.5dbc300000000p-2, 0x1.f77ced1a7a83bp-23),
+    (double2)(0x1.6148400000000p-2, 0x1.84e7f0cb1b500p-29),
+    (double2)(0x1.64d1f00000000p-2, 0x1.ec6b838b02dfep-23),
+    (double2)(0x1.6859700000000p-2, 0x1.3ebf4dfbeda87p-23),
+    (double2)(0x1.6bdea00000000p-2, 0x1.9397aed9cb475p-23),
+    (double2)(0x1.6f61900000000p-2, 0x1.07937bc239c54p-24),
+    (double2)(0x1.72e2200000000p-2, 0x1.aa754553131b6p-23),
+    (double2)(0x1.7660700000000p-2, 0x1.4a05d407c45dcp-24),
+    (double2)(0x1.79dc600000000p-2, 0x1.132231a206dd0p-23),
+    (double2)(0x1.7d56000000000p-2, 0x1.2d8ecfdd69c88p-24),
+    (double2)(0x1.80cd400000000p-2, 0x1.a852c74218606p-24),
+    (double2)(0x1.8442200000000p-2, 0x1.71bf2baeebb50p-23),
+    (double2)(0x1.87b4b00000000p-2, 0x1.83d7db7491820p-27),
+    (double2)(0x1.8b24d00000000p-2, 0x1.ca50d92b6da14p-25),
+    (double2)(0x1.8e92900000000p-2, 0x1.6f5cde8530298p-26),
+    (double2)(0x1.91fde00000000p-2, 0x1.f343198910740p-24),
+    (double2)(0x1.9566d00000000p-2, 0x1.0e8d241ccd80ap-24),
+    (double2)(0x1.98cd500000000p-2, 0x1.1535ac619e6c8p-24),
+    (double2)(0x1.9c31600000000p-2, 0x1.7316041c36cd2p-24),
+    (double2)(0x1.9f93000000000p-2, 0x1.985a000637d8ep-24),
+    (double2)(0x1.a2f2300000000p-2, 0x1.f2f29858c0a68p-25),
+    (double2)(0x1.a64ee00000000p-2, 0x1.879847f96d909p-23),
+    (double2)(0x1.a9a9200000000p-2, 0x1.ab3d319e12e42p-23),
+    (double2)(0x1.ad00f00000000p-2, 0x1.5088162dfc4c2p-24),
+    (double2)(0x1.b056400000000p-2, 0x1.05749a1cd9d8cp-25),
+    (double2)(0x1.b3a9100000000p-2, 0x1.da65c6c6b8618p-26),
+    (double2)(0x1.b6f9600000000p-2, 0x1.739bf7df1ad64p-25),
+    (double2)(0x1.ba47300000000p-2, 0x1.bc31252aa3340p-25),
+    (double2)(0x1.bd92800000000p-2, 0x1.e528191ad3aa8p-26),
+    (double2)(0x1.c0db400000000p-2, 0x1.929d93df19f18p-23),
+    (double2)(0x1.c421900000000p-2, 0x1.ff11eb693a080p-26),
+    (double2)(0x1.c765500000000p-2, 0x1.55ae3f145a3a0p-27),
+    (double2)(0x1.caa6800000000p-2, 0x1.cbcd8c6c0ca82p-24),
+    (double2)(0x1.cde5300000000p-2, 0x1.0cb04d425d304p-24),
+    (double2)(0x1.d121500000000p-2, 0x1.9adfcab5be678p-24),
+    (double2)(0x1.d45ae00000000p-2, 0x1.93d90c5662508p-23),
+    (double2)(0x1.d791f00000000p-2, 0x1.68489bd35ff40p-24),
+    (double2)(0x1.dac6700000000p-2, 0x1.586ed3da2b7e0p-28),
+    (double2)(0x1.ddf8500000000p-2, 0x1.7604d2e850eeep-23),
+    (double2)(0x1.e127b00000000p-2, 0x1.ac1d12bfb53d8p-24),
+    (double2)(0x1.e454800000000p-2, 0x1.9b3d468274740p-28),
+    (double2)(0x1.e77eb00000000p-2, 0x1.fc5d68d10e53cp-24),
+    (double2)(0x1.eaa6500000000p-2, 0x1.8f9e51884becbp-23),
+    (double2)(0x1.edcb600000000p-2, 0x1.a87f0869c06d1p-23),
+    (double2)(0x1.f0ede00000000p-2, 0x1.31e7279f685fap-23),
+    (double2)(0x1.f40dd00000000p-2, 0x1.6a8282f9719b0p-27),
+    (double2)(0x1.f72b200000000p-2, 0x1.0d2724a8a44e0p-25),
+    (double2)(0x1.fa45d00000000p-2, 0x1.a60524b11ad4ep-23),
+    (double2)(0x1.fd5e000000000p-2, 0x1.75fdf832750f0p-26),
+    (double2)(0x1.0039c00000000p-1, 0x1.cf06902e4cd36p-23),
+    (double2)(0x1.01c3400000000p-1, 0x1.e82422d4f6d10p-25),
+    (double2)(0x1.034b700000000p-1, 0x1.24a091063e6c0p-26),
+    (double2)(0x1.04d2500000000p-1, 0x1.8a1a172dc6f38p-24),
+    (double2)(0x1.0657e00000000p-1, 0x1.29b6619f8a92dp-22),
+    (double2)(0x1.07dc300000000p-1, 0x1.9274d9c1b70c8p-24),
+    (double2)(0x1.095f300000000p-1, 0x1.0c34b1fbb7930p-26),
+    (double2)(0x1.0ae0e00000000p-1, 0x1.639866c20eb50p-25),
+    (double2)(0x1.0c61400000000p-1, 0x1.6d6d0f6832e9ep-23),
+    (double2)(0x1.0de0500000000p-1, 0x1.af54def99f25ep-22),
+    (double2)(0x1.0f5e200000000p-1, 0x1.16cfc52a00262p-22),
+    (double2)(0x1.10daa00000000p-1, 0x1.dcc1e83569c32p-23),
+    (double2)(0x1.1255d00000000p-1, 0x1.37f7a551ed425p-22),
+    (double2)(0x1.13cfb00000000p-1, 0x1.f6360adc98887p-22),
+    (double2)(0x1.1548500000000p-1, 0x1.2c6ec8d35a2c1p-22),
+    (double2)(0x1.16bfa00000000p-1, 0x1.bd44df84cb036p-23),
+    (double2)(0x1.1835a00000000p-1, 0x1.117cf826e310ep-22),
+    (double2)(0x1.19aa500000000p-1, 0x1.ca533f332cfc9p-22),
+    (double2)(0x1.1b1dc00000000p-1, 0x1.0f208509dbc2ep-22),
+    (double2)(0x1.1c8fe00000000p-1, 0x1.cd07d93c945dep-23),
+    (double2)(0x1.1e00b00000000p-1, 0x1.57bdfd67e6d72p-22),
+    (double2)(0x1.1f70400000000p-1, 0x1.aab89c516c658p-24),
+    (double2)(0x1.20de800000000p-1, 0x1.3e823b1a1b8a0p-25),
+    (double2)(0x1.224b700000000p-1, 0x1.307464a9d6d3cp-23),
+    (double2)(0x1.23b7100000000p-1, 0x1.c5993cd438843p-22),
+    (double2)(0x1.2521700000000p-1, 0x1.ba2fca02ab554p-22),
+    (double2)(0x1.268a900000000p-1, 0x1.01a5b6983a268p-23),
+    (double2)(0x1.27f2600000000p-1, 0x1.273d1b350efc8p-25),
+    (double2)(0x1.2958e00000000p-1, 0x1.64c238c37b0c6p-23),
+    (double2)(0x1.2abe200000000p-1, 0x1.aded07370a300p-25),
+    (double2)(0x1.2c22100000000p-1, 0x1.78091197eb47ep-23),
+    (double2)(0x1.2d84c00000000p-1, 0x1.4b0f245e0dabcp-24),
+    (double2)(0x1.2ee6200000000p-1, 0x1.080d9794e2eafp-22),
+    (double2)(0x1.3046400000000p-1, 0x1.d4ec242b60c76p-23),
+    (double2)(0x1.31a5200000000p-1, 0x1.221d2f940caa0p-27),
+    (double2)(0x1.3302b00000000p-1, 0x1.cdbc42b2bba5cp-24),
+    (double2)(0x1.345f000000000p-1, 0x1.cce37bb440840p-25),
+    (double2)(0x1.35ba000000000p-1, 0x1.6c1d999cf1dd0p-22),
+    (double2)(0x1.3713d00000000p-1, 0x1.bed8a07eb0870p-26),
+    (double2)(0x1.386c500000000p-1, 0x1.69ed88f490e3cp-24),
+    (double2)(0x1.39c3900000000p-1, 0x1.cd41719b73ef0p-25),
+    (double2)(0x1.3b19800000000p-1, 0x1.cbc4ac95b41b7p-22),
+    (double2)(0x1.3c6e400000000p-1, 0x1.238f1b890f5d7p-22),
+    (double2)(0x1.3dc1c00000000p-1, 0x1.50c4282259cc4p-24),
+    (double2)(0x1.3f13f00000000p-1, 0x1.713d2de87b3e2p-22),
+    (double2)(0x1.4064f00000000p-1, 0x1.1d5a7d2255276p-23),
+    (double2)(0x1.41b4a00000000p-1, 0x1.c0dfd48227ac1p-22),
+    (double2)(0x1.4303200000000p-1, 0x1.1c964dab76753p-22),
+    (double2)(0x1.4450600000000p-1, 0x1.6de56d5704496p-23),
+    (double2)(0x1.459c600000000p-1, 0x1.4aeb71fd19968p-23),
+    (double2)(0x1.46e7200000000p-1, 0x1.fbf91c57b1918p-23),
+    (double2)(0x1.4830a00000000p-1, 0x1.d6bef7fbe5d9ap-22),
+    (double2)(0x1.4978f00000000p-1, 0x1.464d3dc249066p-22),
+    (double2)(0x1.4ac0000000000p-1, 0x1.638e2ec4d9073p-22),
+    (double2)(0x1.4c05e00000000p-1, 0x1.16f4a7247ea7cp-24),
+    (double2)(0x1.4d4a800000000p-1, 0x1.1a0a740f1d440p-28),
+    (double2)(0x1.4e8de00000000p-1, 0x1.6edbb0114a33cp-23),
+    (double2)(0x1.4fd0100000000p-1, 0x1.dbee8bf1d513cp-24),
+    (double2)(0x1.5111000000000p-1, 0x1.5b8bdb0248f73p-22),
+    (double2)(0x1.5250c00000000p-1, 0x1.7de3d3f5eac64p-22),
+    (double2)(0x1.538f500000000p-1, 0x1.ee24187ae448ap-23),
+    (double2)(0x1.54cca00000000p-1, 0x1.e06c591ec5192p-22),
+    (double2)(0x1.5608d00000000p-1, 0x1.4e3861a332738p-24),
+    (double2)(0x1.5743c00000000p-1, 0x1.a9599dcc2bfe4p-24),
+    (double2)(0x1.587d800000000p-1, 0x1.f732fbad43468p-25),
+    (double2)(0x1.59b6000000000p-1, 0x1.eb9f573b727d9p-22),
+    (double2)(0x1.5aed600000000p-1, 0x1.8b212a2eb9897p-22),
+    (double2)(0x1.5c23900000000p-1, 0x1.384884c167215p-22),
+    (double2)(0x1.5d58900000000p-1, 0x1.0e2d363020051p-22),
+    (double2)(0x1.5e8c600000000p-1, 0x1.2820879fbd022p-22),
+    (double2)(0x1.5fbf000000000p-1, 0x1.a1ab9893e4b30p-22),
+    (double2)(0x1.60f0800000000p-1, 0x1.2d1b817a24478p-23),
+    (double2)(0x1.6220d00000000p-1, 0x1.15d7b8ded4878p-25),
+    (double2)(0x1.634ff00000000p-1, 0x1.8968f9db3a5e4p-24),
+    (double2)(0x1.647de00000000p-1, 0x1.71c4171fe135fp-22),
+    (double2)(0x1.65aab00000000p-1, 0x1.6d80f605d0d8cp-22),
+    (double2)(0x1.66d6600000000p-1, 0x1.c91f043691590p-24),
+    (double2)(0x1.6800e00000000p-1, 0x1.39f8a15fce2b2p-23),
+    (double2)(0x1.692a400000000p-1, 0x1.55beda9d94b80p-27),
+    (double2)(0x1.6a52700000000p-1, 0x1.b12c15d60949ap-23),
+    (double2)(0x1.6b79800000000p-1, 0x1.24167b312bfe3p-22),
+    (double2)(0x1.6c9f700000000p-1, 0x1.0ab8633070277p-22),
+    (double2)(0x1.6dc4400000000p-1, 0x1.54554ebbc80eep-23),
+    (double2)(0x1.6ee7f00000000p-1, 0x1.0204aef5a4bb8p-25),
+    (double2)(0x1.700a700000000p-1, 0x1.8af08c679cf2cp-22),
+    (double2)(0x1.712be00000000p-1, 0x1.0852a330ae6c8p-22),
+    (double2)(0x1.724c300000000p-1, 0x1.6d3eb9ec32916p-23),
+    (double2)(0x1.736b600000000p-1, 0x1.685cb7fcbbafep-23),
+    (double2)(0x1.7489700000000p-1, 0x1.1f751c1e0bd95p-22),
+    (double2)(0x1.75a6700000000p-1, 0x1.705b1b0f72560p-26),
+    (double2)(0x1.76c2400000000p-1, 0x1.b98d8d808ca92p-22),
+    (double2)(0x1.77dd100000000p-1, 0x1.2ea22c75cc980p-25),
+    (double2)(0x1.78f6b00000000p-1, 0x1.7aba62bca0350p-22),
+    (double2)(0x1.7a0f400000000p-1, 0x1.d73833442278cp-22),
+    (double2)(0x1.7b26c00000000p-1, 0x1.5a5ca1fb18bf9p-22),
+    (double2)(0x1.7c3d300000000p-1, 0x1.1a6092b6ecf28p-25),
+    (double2)(0x1.7d52800000000p-1, 0x1.44fd049aac104p-24),
+    (double2)(0x1.7e66c00000000p-1, 0x1.c114fd8df5180p-29),
+    (double2)(0x1.7f79e00000000p-1, 0x1.5972f130feae5p-22),
+    (double2)(0x1.808c000000000p-1, 0x1.ca034a55fe198p-24),
+    (double2)(0x1.819d000000000p-1, 0x1.6e2b149990227p-22),
+    (double2)(0x1.82ad000000000p-1, 0x1.b00000294592cp-24),
+    (double2)(0x1.83bbe00000000p-1, 0x1.8b9bdc442620ep-22),
+    (double2)(0x1.84c9c00000000p-1, 0x1.d94fdfabf3e4ep-23),
+    (double2)(0x1.85d6900000000p-1, 0x1.5db30b145ad9ap-23),
+    (double2)(0x1.86e2500000000p-1, 0x1.e3e1eb95022b0p-23),
+    (double2)(0x1.87ed000000000p-1, 0x1.d5b8b45442bd6p-22),
+    (double2)(0x1.88f6b00000000p-1, 0x1.7a046231ecd2ep-22),
+    (double2)(0x1.89ff500000000p-1, 0x1.feafe3ef55232p-22),
+    (double2)(0x1.8b06f00000000p-1, 0x1.839e7bfd78267p-22),
+    (double2)(0x1.8c0d900000000p-1, 0x1.45cf49d6fa900p-25),
+    (double2)(0x1.8d13200000000p-1, 0x1.be3132b27f380p-27),
+    (double2)(0x1.8e17a00000000p-1, 0x1.533980bb84f9fp-22),
+    (double2)(0x1.8f1b300000000p-1, 0x1.889e2ce3ba390p-26),
+    (double2)(0x1.901db00000000p-1, 0x1.f7778c3ad0cc8p-24),
+    (double2)(0x1.911f300000000p-1, 0x1.46660cec4eba2p-23),
+    (double2)(0x1.921fb00000000p-1, 0x1.5110b4611a626p-23),
+)
+
diff --git a/amd-builtins/math64/atan2piD.cl b/amd-builtins/math64/atan2piD.cl new file mode 100644 index 0000000..9f3f026 --- /dev/null +++ b/amd-builtins/math64/atan2piD.cl
@@ -0,0 +1,168 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include "math64.h"
+
+// atan2pi(y, x) = atan2(y, x) / pi, i.e. the two-argument arctangent
+// expressed in units of pi.  Structure: scale inputs if both are tiny,
+// reduce to 0 < v < u, approximate atan(v/u) via the atan(j/256) table
+// plus a short polynomial correction (or pure series for small v/u),
+// apply quadrant corrections in head/tail form, and divide by pi at the
+// very end.  The special-case constants 0.25, 0.5, 0.75 and 1.0 are the
+// pi-scaled equivalents of pi/4, pi/2, 3pi/4 and pi.
+__attribute__((overloadable, always_inline, weak)) double
+atan2pi(double y, double x)
+{
+    USE_TABLE(double2, atan_jby256_tbl, ATAN_JBY256_TBL);
+
+    const double pi = 3.1415926535897932e+00;          /* 0x400921fb54442d18 */
+    const double pi_head = 3.1415926218032836e+00;     /* 0x400921fb50000000 */
+    const double pi_tail = 3.1786509547056392e-08;     /* 0x3e6110b4611a6263 */
+    const double piby2_head = 1.5707963267948965e+00;  /* 0x3ff921fb54442d18 */
+    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */
+
+    // Keep the raw inputs for the special-case selection at the end;
+    // x/y themselves may be rescaled below.
+    double x2 = x;
+    int xneg = as_int2(x).hi < 0;
+    int xexp = (as_int2(x).hi >> 20) & 0x7ff;  // biased exponent of x
+
+    double y2 = y;
+    int yneg = as_int2(y).hi < 0;
+    int yexp = (as_int2(y).hi >> 20) & 0x7ff;  // biased exponent of y
+
+    int cond2 = (xexp < 1021) & (yexp < 1021);
+    int diffexp = yexp - xexp;
+
+    // Scale up both x and y if they are both below 1/4
+    // (protects the exponent-difference tests against subnormals)
+    double x1 = ldexp(x, 1024);
+    int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff;
+    double y1 = ldexp(y, 1024);
+    int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff;
+    int diffexp1 = yexp1 - xexp1;
+
+    diffexp = cond2 ? diffexp1 : diffexp;
+    x = cond2 ? x1 : x;
+    y = cond2 ? y1 : y;
+
+    // General case: take absolute values of arguments
+    double u = fabs(x);
+    double v = fabs(y);
+
+    // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+    int swap_vu = u < v;
+    double uu = u;
+    u = swap_vu ? v : u;
+    v = swap_vu ? uu : v;
+
+    double vbyu = v / u;
+    double q1, q2;  // head/tail of atan(v/u)
+
+    // General values of v/u. Use a look-up table and series expansion.
+
+    {
+        // Clamp the table argument so the index stays valid even when
+        // vbyu <= 0.0625 (that branch's result is discarded below).
+        double val = vbyu > 0.0625 ? vbyu : 0.063;
+        int index = convert_int(fma(256.0, val, 0.5));
+        double2 tv = atan_jby256_tbl[index - 16];
+        q1 = tv.s0;  // leading part of atan(index/256)
+        q2 = tv.s1;  // trailing part of atan(index/256)
+        double c = (double)index * 0x1.0p-8;
+
+        // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
+        // u_exponent could be EMAX so we have to do it in 2 steps
+        int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
+        double um = ldexp(u, m);
+        double vm = ldexp(v, m);
+
+        // 26 leading bits of u
+        double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL);
+        double u2 = um - u1;
+
+        // r = (v/u - c) / (1 + c*v/u), computed as (vm - c*um)/(um + c*vm)
+        double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um));
+
+        // Polynomial approximation to atan(r)
+        double s = r * r;
+        q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r);
+    }
+
+    // vbyu below ~2^-26.1: atan(vbyu) ~= vbyu to full precision
+    double q3, q4;
+    {
+        q3 = 0.0;
+        q4 = vbyu;
+    }
+
+    // Small vbyu (<= 0.0625) but not tiny: Taylor series for atan(vbyu)
+    double q5, q6;
+    {
+        double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL);
+        double u2 = u - u1;
+        double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL);
+        double vu2 = vbyu - vu1;
+
+        q5 = 0.0;
+        double s = vbyu * vbyu;
+        q6 = vbyu + fma(-vbyu * s,
+                        fma(-s,
+                            fma(-s,
+                                fma(-s,
+                                    fma(-s, 0.90029810285449784439E-01,
+                                        0.11110736283514525407),
+                                    0.14285713561807169030),
+                                0.19999999999393223405),
+                            0.33333333333333170500),
+                        MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u));
+    }
+
+    // Select among the three regimes of vbyu
+    q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
+    q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+    q1 = vbyu > 0.0625 ? q1 : q3;
+    q2 = vbyu > 0.0625 ? q2 : q4;
+
+    // Tidy-up according to which quadrant the arguments lie in
+    double res1, res2, res3, res4;
+    q1 = swap_vu ? piby2_head - q1 : q1;   // atan(u/v) = pi/2 - atan(v/u)
+    q2 = swap_vu ? piby2_tail - q2 : q2;
+    q1 = xneg ? pi_head - q1 : q1;         // reflect into quadrants II/III
+    q2 = xneg ? pi_tail - q2 : q2;
+    q1 = MATH_DIVIDE(q1 + q2, pi);         // scale the final result by 1/pi
+    res4 = yneg ? -q1 : q1;
+
+    // Both arguments infinite: result is +-0.25 or +-0.75 (units of pi)
+    res1 = yneg ? -0.75 : 0.75;
+    res2 = yneg ? -0.25 : 0.25;
+    res3 = xneg ? res1 : res2;
+
+    res3 = isinf(y2) & isinf(x2) ? res3 : res4;
+    res1 = yneg ? -1.0 : 1.0;
+
+    // abs(x)/abs(y) > 2^56 and x < 0
+    res3 = (diffexp < -56 && xneg) ? res1 : res3;
+
+    res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi);
+    // x positive and dominant over y by a factor of 2^28
+    res3 = diffexp < -28 & xneg == 0 ? res4 : res3;
+
+    // abs(y)/abs(x) > 2^56
+    res4 = yneg ? -0.5 : 0.5; // atan(y/x) is insignificant compared to pi/2; result is +-(pi/2)/pi
+    res3 = diffexp > 56 ? res4 : res3;
+
+    res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +-0.5 (i.e. +-(pi/2)/pi) depending on sign of y
+    res4 = xneg ? res1 : y2;
+
+    res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-1 (i.e. +-pi/pi) for negative x
+    res3 = isnan(y2) ? y2 : res3;   // NaN inputs propagate
+    res3 = isnan(x2) ? x2 : res3;
+
+    return res3;
+}
diff --git a/amd-builtins/math64/atanD.cl b/amd-builtins/math64/atanD.cl new file mode 100644 index 0000000..0e803c7 --- /dev/null +++ b/amd-builtins/math64/atanD.cl
@@ -0,0 +1,101 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +__attribute__((overloadable)) double +atan(double x) +{ + const double piby2 = 1.5707963267948966e+00; // 0x3ff921fb54442d18 + + double v = fabs(x); + + // 2^56 > v > 39/16 + double a = -1.0; + double b = v; + // (chi + clo) = arctan(infinity) + double chi = 1.57079632679489655800e+00; + double clo = 6.12323399573676480327e-17; + + double ta = v - 1.5; + double tb = 1.0 + 1.5 * v; + int l = v <= 0x1.38p+1; // 39/16 > v > 19/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.5) + chi = l ? 9.82793723247329054082e-01 : chi; + clo = l ? 1.39033110312309953701e-17 : clo; + + ta = v - 1.0; + tb = 1.0 + v; + l = v <= 0x1.3p+0; // 19/16 > v > 11/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.) + chi = l ? 7.85398163397448278999e-01 : chi; + clo = l ? 
3.06161699786838240164e-17 : clo; + + ta = 2.0 * v - 1.0; + tb = 2.0 + v; + l = v <= 0x1.6p-1; // 11/16 > v > 7/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(0.5) + chi = l ? 4.63647609000806093515e-01 : chi; + clo = l ? 2.26987774529616809294e-17 : clo; + + l = v <= 0x1.cp-2; // v < 7/16 + a = l ? v : a; + b = l ? 1.0 : b;; + chi = l ? 0.0 : chi; + clo = l ? 0.0 : clo; + + // Core approximation: Remez(4,4) on [-7/16,7/16] + double r = a / b; + double s = r * r; + double qn = fma(s, + fma(s, + fma(s, + fma(s, 0.142316903342317766e-3, + 0.304455919504853031e-1), + 0.220638780716667420e0), + 0.447677206805497472e0), + 0.268297920532545909e0); + + double qd = fma(s, + fma(s, + fma(s, + fma(s, 0.389525873944742195e-1, + 0.424602594203847109e0), + 0.141254259931958921e1), + 0.182596787737507063e1), + 0.804893761597637733e0); + + double q = r * s * qn / qd; + r = chi - ((q - clo) - r); + + double z = isnan(x) ? x : piby2; + z = v <= 0x1.0p+56 ? r : z; + z = v < 0x1.0p-26 ? v : z; + return x == v ? z : -z; +} +
diff --git a/amd-builtins/math64/atanhD.cl b/amd-builtins/math64/atanhD.cl new file mode 100644 index 0000000..a362100 --- /dev/null +++ b/amd-builtins/math64/atanhD.cl
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+// atanh(x) for double arguments.
+//
+//   |x| >  1        -> NaN
+//   |x| == 1        -> +-infinity
+//   0.5 <= |x| < 1  -> 0.5 * log1p(2|x| / (1 - |x|)), negated for
+//                      negative x (atanh is odd)
+//   |x| <  0.5      -> x + x^3 * P(x^2)/Q(x^2), a [5,5] minimax
+//                      rational approximation (sign handled directly)
+__attribute__((overloadable)) double
+atanh(double x)
+{
+    double absx = fabs(x);
+
+    // Default result for |x| >= 1: +inf at exactly 1, NaN beyond
+    double ret = absx == 1.0 ? as_double(PINFBITPATT_DP64) : as_double(QNANBITPATT_DP64);
+
+    // |x| >= 0.5
+    // Note that atanh(x) = 0.5 * ln((1+x)/(1-x))
+    // For greater accuracy we use
+    // ln((1+x)/(1-x)) = ln(1 + 2x/(1-x)) = log1p(2x/(1-x)).
+    double r = 0.5 * log1p(2.0 * absx / (1.0 - absx));
+    ret = absx < 1.0 ? r : ret;
+
+    // The log1p path used |x|; restore the sign for negative inputs
+    // (also flips +inf to -inf for x == -1)
+    r = -ret;
+    ret = x < 0.0 ? r : ret;
+
+    // Arguments up to 0.5 in magnitude are
+    // approximated by a [5,5] minimax polynomial
+    double t = x * x;
+
+    double pn = fma(t,
+                    fma(t,
+                        fma(t,
+                            fma(t,
+                                fma(t, -0.10468158892753136958e-3, 0.28728638600548514553e-1),
+                                -0.28180210961780814148e0),
+                            0.88468142536501647470e0),
+                        -0.11028356797846341457e1),
+                    0.47482573589747356373e0);
+
+    double pd = fma(t,
+                    fma(t,
+                        fma(t,
+                            fma(t,
+                                fma(t, -0.35861554370169537512e-1, 0.49561196555503101989e0),
+                                -0.22608883748988489342e1),
+                            0.45414700626084508355e1),
+                        -0.41631933639693546274e1),
+                    0.14244772076924206909e1);
+
+    // x + x^3 * pn/pd: odd in x, so no separate sign fix-up is needed
+    r = fma(x*t, pn/pd, x);
+    ret = absx < 0.5 ? r : ret;
+
+    return ret;
+}
+
diff --git a/amd-builtins/math64/atanpiD.cl b/amd-builtins/math64/atanpiD.cl new file mode 100644 index 0000000..0ce71e6 --- /dev/null +++ b/amd-builtins/math64/atanpiD.cl
@@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +__attribute__((overloadable)) double +atanpi(double x) +{ + const double pi = 0x1.921fb54442d18p+1; + + double v = fabs(x); + + // 2^56 > v > 39/16 + double a = -1.0; + double b = v; + // (chi + clo) = arctan(infinity) + double chi = 1.57079632679489655800e+00; + double clo = 6.12323399573676480327e-17; + + double ta = v - 1.5; + double tb = 1.0 + 1.5 * v; + int l = v <= 0x1.38p+1; // 39/16 > v > 19/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.5) + chi = l ? 9.82793723247329054082e-01 : chi; + clo = l ? 1.39033110312309953701e-17 : clo; + + ta = v - 1.0; + tb = 1.0 + v; + l = v <= 0x1.3p+0; // 19/16 > v > 11/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.) + chi = l ? 7.85398163397448278999e-01 : chi; + clo = l ? 
3.06161699786838240164e-17 : clo; + + ta = 2.0 * v - 1.0; + tb = 2.0 + v; + l = v <= 0x1.6p-1; // 11/16 > v > 7/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(0.5) + chi = l ? 4.63647609000806093515e-01 : chi; + clo = l ? 2.26987774529616809294e-17 : clo; + + l = v <= 0x1.cp-2; // v < 7/16 + a = l ? v : a; + b = l ? 1.0 : b;; + chi = l ? 0.0 : chi; + clo = l ? 0.0 : clo; + + // Core approximation: Remez(4,4) on [-7/16,7/16] + double r = a / b; + double s = r * r; + double qn = fma(s, + fma(s, + fma(s, + fma(s, 0.142316903342317766e-3, + 0.304455919504853031e-1), + 0.220638780716667420e0), + 0.447677206805497472e0), + 0.268297920532545909e0); + + double qd = fma(s, + fma(s, + fma(s, + fma(s, 0.389525873944742195e-1, + 0.424602594203847109e0), + 0.141254259931958921e1), + 0.182596787737507063e1), + 0.804893761597637733e0); + + double q = r * s * qn / qd; + r = (chi - ((q - clo) - r)) / pi; + double vp = v / pi; + + double z = isnan(x) ? x : 0.5; + z = v <= 0x1.0p+56 ? r : z; + z = v < 0x1.0p-26 ? vp : z; + return x == v ? z : -z; +} +
diff --git a/amd-builtins/math64/cbrtD.cl b/amd-builtins/math64/cbrtD.cl new file mode 100644 index 0000000..9f0c688 --- /dev/null +++ b/amd-builtins/math64/cbrtD.cl
@@ -0,0 +1,115 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */

#include "math64.h"

// Algorithm:
//
// x = (2^m)*A
// x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
// x = (2^m)*2*(G/2+g/2)
// x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
//
// Y = (2^(-1))*(2^(-m))*(2^m)*A
// Now, range of Y is: 0.5 <= Y < 1
//
// F = 0x100 + (first 7 mantissa bits) + (8th mantissa bit)
// Now, range of F is: 128 <= F <= 256
// F = F / 256
// Now, range of F is: 0.5 <= F <= 1
//
// f = (Y-F), with (f <= 2^(-9))
//
// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F+f)
// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F) + cbrt(1+(f/F))
// cbrt(x) = cbrt(2^m) * cbrt(2*F) * cbrt(1+r)
//
// r = (f/F), with (r <= 2^(-8))
// r = f*(1/F) with (1/F) precomputed to avoid division
//
// cbrt(x) = cbrt(2^m) * cbrt(G) * (1+poly)
//
// poly = c1*r + c2*(r^2) + c3*(r^3) + c4*(r^4) + c5*(r^5) + c6*(r^6)


// Table-driven double-precision cube root.
// Tables (declared in cbrtD_table.h): p_inv = 1/F reciprocals, p_cbrt =
// hi/lo split of cbrt(2*F), p_rem = hi/lo split of cbrt(2^k) for k in
// [-2, 2] (exponent remainder after division by 3).
__attribute__((overloadable)) double
cbrt(double x)
{
    USE_TABLE(double, p_inv, CBRT_TBL_INV);
    USE_TABLE(double2, p_cbrt, CBRT_TBL);
    USE_TABLE(double2, p_rem, CBRT_TBL_REM);

    // +-inf, NaN and +-0 are all returned unchanged at the end.
    int return_x = isinf(x) | isnan(x) | x == 0.0;
    ulong ux = as_ulong(fabs(x));
    // Unbiased exponent m from the top word of the IEEE-754 bit pattern.
    int m = (as_int2(ux).hi >> 20) - 1023;

    // Treat subnormals: normalize by forming 1.mantissa and subtracting 1,
    // then recompute the effective exponent.
    ulong uxs = as_ulong(as_double(0x3ff0000000000000UL | ux) - 1.0);
    int ms = m + (as_int2(uxs).hi >> 20) - 1022;

    int c = m == -1023;  // true only for subnormal inputs
    ux = c ? uxs : ux;
    m = c ? ms : m;

    // Split exponent as m = 3*mby3 + rem so cbrt(2^m) = 2^mby3 * cbrt(2^rem).
    int mby3 = m / 3;
    int rem = m - 3*mby3;

    // mf = 2^mby3, built directly as a bit pattern.
    double mf = as_double((ulong)(mby3 + 1023) << 52);

    ux &= 0x000fffffffffffffUL;
    // Y = mantissa scaled into [0.5, 1).
    double Y = as_double(0x3fe0000000000000UL | ux);

    // nearest integer
    // index in [256, 512]: top 8 mantissa bits rounded to nearest.
    int index = as_int2(ux).hi >> 11;
    index = (0x100 | (index >> 1)) + (index & 1);
    double F = (double)index * 0x1.0p-9;

    // r = (Y - F)/F with the division replaced by a table lookup of 1/F.
    double f = Y - F;
    double r = f * p_inv[index-256];

    // Degree-6 polynomial for cbrt(1+r) - 1 (coefficients c1..c6 above).
    double z = r * fma(r,
                       fma(r,
                           fma(r,
                               fma(r,
                                   fma(r, -0x1.8090d6221a247p-6, 0x1.ee7113506ac13p-6),
                                   -0x1.511e8d2b3183bp-5),
                               0x1.f9add3c0ca458p-5),
                           -0x1.c71c71c71c71cp-4),
                       0x1.5555555555555p-2);

    // cbrt(2^rem) as a head/tail pair; rem is in [-2, 2], hence the +2 bias.
    double2 tv = p_rem[rem+2];
    double Rem_h = tv.s0;
    double Rem_t = tv.s1;

    // cbrt(2*F) as a head/tail pair.
    tv = p_cbrt[index-256];
    double F_h = tv.s0;
    double F_t = tv.s1;

    // b = cbrt(2*F) * cbrt(2^rem) in head/tail double-double arithmetic.
    double b_h = F_h * Rem_h;
    double b_t = fma(Rem_t, F_h, fma(F_t, Rem_h, F_t*Rem_t));

    // ans = b * (1 + z), then scale by 2^mby3 and restore the sign of x.
    double ans = fma(z, b_h, fma(z, b_t, b_t)) + b_h;
    ans = copysign(ans*mf, x);
    return return_x ? x : ans;
}
diff --git a/amd-builtins/math64/cbrtD_table.h b/amd-builtins/math64/cbrtD_table.h new file mode 100644 index 0000000..5dbfe55 --- /dev/null +++ b/amd-builtins/math64/cbrtD_table.h
@@ -0,0 +1,550 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +DECLARE_TABLE(double, CBRT_TBL_INV, 257, + 0x1.0000000000000p+1, + 0x1.fe01fe01fe020p+0, + 0x1.fc07f01fc07f0p+0, + 0x1.fa11caa01fa12p+0, + 0x1.f81f81f81f820p+0, + 0x1.f6310aca0dbb5p+0, + 0x1.f44659e4a4271p+0, + 0x1.f25f644230ab5p+0, + 0x1.f07c1f07c1f08p+0, + 0x1.ee9c7f8458e02p+0, + 0x1.ecc07b301ecc0p+0, + 0x1.eae807aba01ebp+0, + 0x1.e9131abf0b767p+0, + 0x1.e741aa59750e4p+0, + 0x1.e573ac901e574p+0, + 0x1.e3a9179dc1a73p+0, + 0x1.e1e1e1e1e1e1ep+0, + 0x1.e01e01e01e01ep+0, + 0x1.de5d6e3f8868ap+0, + 0x1.dca01dca01dcap+0, + 0x1.dae6076b981dbp+0, + 0x1.d92f2231e7f8ap+0, + 0x1.d77b654b82c34p+0, + 0x1.d5cac807572b2p+0, + 0x1.d41d41d41d41dp+0, + 0x1.d272ca3fc5b1ap+0, + 0x1.d0cb58f6ec074p+0, + 0x1.cf26e5c44bfc6p+0, + 0x1.cd85689039b0bp+0, + 0x1.cbe6d9601cbe7p+0, + 0x1.ca4b3055ee191p+0, + 0x1.c8b265afb8a42p+0, + 0x1.c71c71c71c71cp+0, + 0x1.c5894d10d4986p+0, + 0x1.c3f8f01c3f8f0p+0, + 0x1.c26b5392ea01cp+0, + 0x1.c0e070381c0e0p+0, + 0x1.bf583ee868d8bp+0, + 0x1.bdd2b899406f7p+0, + 0x1.bc4fd65883e7bp+0, + 0x1.bacf914c1bad0p+0, + 0x1.b951e2b18ff23p+0, + 0x1.b7d6c3dda338bp+0, + 0x1.b65e2e3beee05p+0, + 0x1.b4e81b4e81b4fp+0, + 0x1.b37484ad806cep+0, + 0x1.b2036406c80d9p+0, + 0x1.b094b31d922a4p+0, + 0x1.af286bca1af28p+0, + 0x1.adbe87f94905ep+0, + 0x1.ac5701ac5701bp+0, + 0x1.aaf1d2f87ebfdp+0, + 0x1.a98ef606a63bep+0, + 0x1.a82e65130e159p+0, + 0x1.a6d01a6d01a6dp+0, + 0x1.a574107688a4ap+0, + 0x1.a41a41a41a41ap+0, + 0x1.a2c2a87c51ca0p+0, + 0x1.a16d3f97a4b02p+0, + 0x1.a01a01a01a01ap+0, + 0x1.9ec8e951033d9p+0, + 0x1.9d79f176b682dp+0, + 0x1.9c2d14ee4a102p+0, + 0x1.9ae24ea5510dap+0, + 0x1.999999999999ap+0, + 0x1.9852f0d8ec0ffp+0, + 0x1.970e4f80cb872p+0, + 0x1.95cbb0be377aep+0, + 0x1.948b0fcd6e9e0p+0, + 0x1.934c67f9b2ce6p+0, + 0x1.920fb49d0e229p+0, + 0x1.90d4f120190d5p+0, + 0x1.8f9c18f9c18fap+0, + 0x1.8e6527af1373fp+0, + 0x1.8d3018d3018d3p+0, + 0x1.8bfce8062ff3ap+0, + 0x1.8acb90f6bf3aap+0, + 0x1.899c0f601899cp+0, + 0x1.886e5f0abb04ap+0, + 0x1.87427bcc092b9p+0, + 0x1.8618618618618p+0, + 
0x1.84f00c2780614p+0, + 0x1.83c977ab2beddp+0, + 0x1.82a4a0182a4a0p+0, + 0x1.8181818181818p+0, + 0x1.8060180601806p+0, + 0x1.7f405fd017f40p+0, + 0x1.7e225515a4f1dp+0, + 0x1.7d05f417d05f4p+0, + 0x1.7beb3922e017cp+0, + 0x1.7ad2208e0ecc3p+0, + 0x1.79baa6bb6398bp+0, + 0x1.78a4c8178a4c8p+0, + 0x1.77908119ac60dp+0, + 0x1.767dce434a9b1p+0, + 0x1.756cac201756dp+0, + 0x1.745d1745d1746p+0, + 0x1.734f0c541fe8dp+0, + 0x1.724287f46debcp+0, + 0x1.713786d9c7c09p+0, + 0x1.702e05c0b8170p+0, + 0x1.6f26016f26017p+0, + 0x1.6e1f76b4337c7p+0, + 0x1.6d1a62681c861p+0, + 0x1.6c16c16c16c17p+0, + 0x1.6b1490aa31a3dp+0, + 0x1.6a13cd1537290p+0, + 0x1.691473a88d0c0p+0, + 0x1.6816816816817p+0, + 0x1.6719f3601671ap+0, + 0x1.661ec6a5122f9p+0, + 0x1.6524f853b4aa3p+0, + 0x1.642c8590b2164p+0, + 0x1.63356b88ac0dep+0, + 0x1.623fa77016240p+0, + 0x1.614b36831ae94p+0, + 0x1.6058160581606p+0, + 0x1.5f66434292dfcp+0, + 0x1.5e75bb8d015e7p+0, + 0x1.5d867c3ece2a5p+0, + 0x1.5c9882b931057p+0, + 0x1.5babcc647fa91p+0, + 0x1.5ac056b015ac0p+0, + 0x1.59d61f123ccaap+0, + 0x1.58ed2308158edp+0, + 0x1.5805601580560p+0, + 0x1.571ed3c506b3ap+0, + 0x1.56397ba7c52e2p+0, + 0x1.5555555555555p+0, + 0x1.54725e6bb82fep+0, + 0x1.5390948f40febp+0, + 0x1.52aff56a8054bp+0, + 0x1.51d07eae2f815p+0, + 0x1.50f22e111c4c5p+0, + 0x1.5015015015015p+0, + 0x1.4f38f62dd4c9bp+0, + 0x1.4e5e0a72f0539p+0, + 0x1.4d843bedc2c4cp+0, + 0x1.4cab88725af6ep+0, + 0x1.4bd3edda68fe1p+0, + 0x1.4afd6a052bf5bp+0, + 0x1.4a27fad76014ap+0, + 0x1.49539e3b2d067p+0, + 0x1.4880522014880p+0, + 0x1.47ae147ae147bp+0, + 0x1.46dce34596066p+0, + 0x1.460cbc7f5cf9ap+0, + 0x1.453d9e2c776cap+0, + 0x1.446f86562d9fbp+0, + 0x1.43a2730abee4dp+0, + 0x1.42d6625d51f87p+0, + 0x1.420b5265e5951p+0, + 0x1.4141414141414p+0, + 0x1.40782d10e6566p+0, + 0x1.3fb013fb013fbp+0, + 0x1.3ee8f42a5af07p+0, + 0x1.3e22cbce4a902p+0, + 0x1.3d5d991aa75c6p+0, + 0x1.3c995a47babe7p+0, + 0x1.3bd60d9232955p+0, + 0x1.3b13b13b13b14p+0, + 0x1.3a524387ac822p+0, + 0x1.3991c2c187f63p+0, + 0x1.38d22d366088ep+0, + 
0x1.3813813813814p+0, + 0x1.3755bd1c945eep+0, + 0x1.3698df3de0748p+0, + 0x1.35dce5f9f2af8p+0, + 0x1.3521cfb2b78c1p+0, + 0x1.34679ace01346p+0, + 0x1.33ae45b57bcb2p+0, + 0x1.32f5ced6a1dfap+0, + 0x1.323e34a2b10bfp+0, + 0x1.3187758e9ebb6p+0, + 0x1.30d190130d190p+0, + 0x1.301c82ac40260p+0, + 0x1.2f684bda12f68p+0, + 0x1.2eb4ea1fed14bp+0, + 0x1.2e025c04b8097p+0, + 0x1.2d50a012d50a0p+0, + 0x1.2c9fb4d812ca0p+0, + 0x1.2bef98e5a3711p+0, + 0x1.2b404ad012b40p+0, + 0x1.2a91c92f3c105p+0, + 0x1.29e4129e4129ep+0, + 0x1.293725bb804a5p+0, + 0x1.288b01288b013p+0, + 0x1.27dfa38a1ce4dp+0, + 0x1.27350b8812735p+0, + 0x1.268b37cd60127p+0, + 0x1.25e22708092f1p+0, + 0x1.2539d7e9177b2p+0, + 0x1.2492492492492p+0, + 0x1.23eb79717605bp+0, + 0x1.23456789abcdfp+0, + 0x1.22a0122a0122ap+0, + 0x1.21fb78121fb78p+0, + 0x1.21579804855e6p+0, + 0x1.20b470c67c0d9p+0, + 0x1.2012012012012p+0, + 0x1.1f7047dc11f70p+0, + 0x1.1ecf43c7fb84cp+0, + 0x1.1e2ef3b3fb874p+0, + 0x1.1d8f5672e4abdp+0, + 0x1.1cf06ada2811dp+0, + 0x1.1c522fc1ce059p+0, + 0x1.1bb4a4046ed29p+0, + 0x1.1b17c67f2bae3p+0, + 0x1.1a7b9611a7b96p+0, + 0x1.19e0119e0119ep+0, + 0x1.19453808ca29cp+0, + 0x1.18ab083902bdbp+0, + 0x1.1811811811812p+0, + 0x1.1778a191bd684p+0, + 0x1.16e0689427379p+0, + 0x1.1648d50fc3201p+0, + 0x1.15b1e5f75270dp+0, + 0x1.151b9a3fdd5c9p+0, + 0x1.1485f0e0acd3bp+0, + 0x1.13f0e8d344724p+0, + 0x1.135c81135c811p+0, + 0x1.12c8b89edc0acp+0, + 0x1.12358e75d3033p+0, + 0x1.11a3019a74826p+0, + 0x1.1111111111111p+0, + 0x1.107fbbe011080p+0, + 0x1.0fef010fef011p+0, + 0x1.0f5edfab325a2p+0, + 0x1.0ecf56be69c90p+0, + 0x1.0e40655826011p+0, + 0x1.0db20a88f4696p+0, + 0x1.0d24456359e3ap+0, + 0x1.0c9714fbcda3bp+0, + 0x1.0c0a7868b4171p+0, + 0x1.0b7e6ec259dc8p+0, + 0x1.0af2f722eecb5p+0, + 0x1.0a6810a6810a7p+0, + 0x1.09ddba6af8360p+0, + 0x1.0953f39010954p+0, + 0x1.08cabb37565e2p+0, + 0x1.0842108421084p+0, + 0x1.07b9f29b8eae2p+0, + 0x1.073260a47f7c6p+0, + 0x1.06ab59c7912fbp+0, + 0x1.0624dd2f1a9fcp+0, + 0x1.059eea0727586p+0, + 0x1.05197f7d73404p+0, + 
0x1.04949cc1664c5p+0, + 0x1.0410410410410p+0, + 0x1.038c6b78247fcp+0, + 0x1.03091b51f5e1ap+0, + 0x1.02864fc7729e9p+0, + 0x1.0204081020408p+0, + 0x1.0182436517a37p+0, + 0x1.0101010101010p+0, + 0x1.0080402010080p+0, + 0x1.0000000000000p+0, +) + +DECLARE_TABLE(double2, CBRT_TBL, 257, + (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.0055380000000p+0, 0x1.e6a24c81e4294p-25), + (double2)(0x1.00aa390000000p+0, 0x1.8548511e3a785p-26), + (double2)(0x1.00ff010000000p+0, 0x1.4eb9336ec07f6p-25), + (double2)(0x1.0153920000000p+0, 0x1.0ea64b8b750e1p-27), + (double2)(0x1.01a7eb0000000p+0, 0x1.61637cff8a53cp-27), + (double2)(0x1.01fc0d0000000p+0, 0x1.0733bf7bd1943p-27), + (double2)(0x1.024ff80000000p+0, 0x1.666911345ccedp-26), + (double2)(0x1.02a3ad0000000p+0, 0x1.77b7a3f592f14p-27), + (double2)(0x1.02f72b0000000p+0, 0x1.f18d3dd1a5402p-25), + (double2)(0x1.034a750000000p+0, 0x1.be2f5a58ee9a4p-29), + (double2)(0x1.039d880000000p+0, 0x1.8901f8f085fa7p-25), + (double2)(0x1.03f0670000000p+0, 0x1.c68b8cd5b5d69p-26), + (double2)(0x1.0443110000000p+0, 0x1.a6b0e8624be42p-26), + (double2)(0x1.0495870000000p+0, 0x1.c4b22b06f68e7p-36), + (double2)(0x1.04e7c80000000p+0, 0x1.0f3f0afcabe9bp-25), + (double2)(0x1.0539d60000000p+0, 0x1.48495bca4e1b7p-26), + (double2)(0x1.058bb00000000p+0, 0x1.6107f1abdfdc3p-25), + (double2)(0x1.05dd570000000p+0, 0x1.e67261878288ap-25), + (double2)(0x1.062ecc0000000p+0, 0x1.a6bc155286f1ep-26), + (double2)(0x1.06800e0000000p+0, 0x1.8a759c64a85f2p-26), + (double2)(0x1.06d11e0000000p+0, 0x1.5fce70a4a8d09p-27), + (double2)(0x1.0721fc0000000p+0, 0x1.2f9cbf373fe1dp-28), + (double2)(0x1.0772a80000000p+0, 0x1.90564ce4ac359p-26), + (double2)(0x1.07c3230000000p+0, 0x1.ac29ce761b02fp-26), + (double2)(0x1.08136d0000000p+0, 0x1.cb752f497381cp-26), + (double2)(0x1.0863860000000p+0, 0x1.8bb9e1cfb35e0p-25), + (double2)(0x1.08b36f0000000p+0, 0x1.5b4917099de90p-25), + (double2)(0x1.0903280000000p+0, 0x1.cc77ac9c65ef2p-26), + (double2)(0x1.0952b10000000p+0, 
0x1.7a0f3e7be3dbap-26), + (double2)(0x1.09a20a0000000p+0, 0x1.6ec851ee0c16fp-25), + (double2)(0x1.09f1340000000p+0, 0x1.89449bf2946dap-25), + (double2)(0x1.0a402f0000000p+0, 0x1.98f25301ba223p-25), + (double2)(0x1.0a8efc0000000p+0, 0x1.47d5ec651f549p-28), + (double2)(0x1.0add990000000p+0, 0x1.c33ec9a86007ap-25), + (double2)(0x1.0b2c090000000p+0, 0x1.e0b6653e92649p-26), + (double2)(0x1.0b7a4b0000000p+0, 0x1.bd64ac09d755fp-28), + (double2)(0x1.0bc85f0000000p+0, 0x1.f537506f78167p-29), + (double2)(0x1.0c16450000000p+0, 0x1.2c382d1b3735ep-25), + (double2)(0x1.0c63fe0000000p+0, 0x1.e20ed659f99e1p-25), + (double2)(0x1.0cb18b0000000p+0, 0x1.86b633a9c182ap-26), + (double2)(0x1.0cfeeb0000000p+0, 0x1.45cfd5a65e777p-27), + (double2)(0x1.0d4c1e0000000p+0, 0x1.0c8770f58bca4p-25), + (double2)(0x1.0d99250000000p+0, 0x1.739e44b0933c5p-25), + (double2)(0x1.0de6010000000p+0, 0x1.27dc3d9ce7bd8p-31), + (double2)(0x1.0e32b00000000p+0, 0x1.3c53c7c5a7b64p-25), + (double2)(0x1.0e7f340000000p+0, 0x1.9669683830cecp-25), + (double2)(0x1.0ecb8d0000000p+0, 0x1.8d772c39bdcc4p-25), + (double2)(0x1.0f17bb0000000p+0, 0x1.9b0008bcf6d7bp-25), + (double2)(0x1.0f63bf0000000p+0, 0x1.bbb305825ce4fp-28), + (double2)(0x1.0faf970000000p+0, 0x1.da3f4af13a406p-25), + (double2)(0x1.0ffb460000000p+0, 0x1.f36b96f74ce86p-26), + (double2)(0x1.1046cb0000000p+0, 0x1.65c002303f790p-30), + (double2)(0x1.1092250000000p+0, 0x1.82f84095ba7d5p-25), + (double2)(0x1.10dd560000000p+0, 0x1.d46433541b2c6p-25), + (double2)(0x1.11285e0000000p+0, 0x1.71c3d56e93a89p-25), + (double2)(0x1.11733d0000000p+0, 0x1.98dcef4e40012p-26), + (double2)(0x1.11bdf30000000p+0, 0x1.530ebef17fe03p-27), + (double2)(0x1.1208800000000p+0, 0x1.e8b8fa3715066p-27), + (double2)(0x1.1252e40000000p+0, 0x1.ab26eb3b211dcp-25), + (double2)(0x1.129d210000000p+0, 0x1.54dd4dc906307p-27), + (double2)(0x1.12e7350000000p+0, 0x1.c9f962387984ep-26), + (double2)(0x1.1331210000000p+0, 0x1.c62a959afec09p-25), + (double2)(0x1.137ae60000000p+0, 0x1.638d9ac6a866ap-25), + 
(double2)(0x1.13c4840000000p+0, 0x1.38704eca8a22dp-28), + (double2)(0x1.140dfa0000000p+0, 0x1.e6c9e1db14f8fp-27), + (double2)(0x1.1457490000000p+0, 0x1.8744b7f9c9eaap-26), + (double2)(0x1.14a0710000000p+0, 0x1.6c2893486373bp-25), + (double2)(0x1.14e9730000000p+0, 0x1.b36bce31699b7p-26), + (double2)(0x1.15324e0000000p+0, 0x1.71e3813d200c7p-25), + (double2)(0x1.157b030000000p+0, 0x1.99755ab40aa88p-25), + (double2)(0x1.15c3920000000p+0, 0x1.b45ca0e4bcfc0p-25), + (double2)(0x1.160bfc0000000p+0, 0x1.2dd090d869c5dp-28), + (double2)(0x1.16543f0000000p+0, 0x1.4fe0516b917dap-25), + (double2)(0x1.169c5d0000000p+0, 0x1.94563226317a2p-25), + (double2)(0x1.16e4560000000p+0, 0x1.53d8fafc2c851p-25), + (double2)(0x1.172c2a0000000p+0, 0x1.dcbd41fbd41a3p-26), + (double2)(0x1.1773d90000000p+0, 0x1.862ff5285f59cp-26), + (double2)(0x1.17bb630000000p+0, 0x1.3072ea97a1e1cp-25), + (double2)(0x1.1802c90000000p+0, 0x1.2839075184805p-26), + (double2)(0x1.184a0a0000000p+0, 0x1.4b0323e9eff42p-25), + (double2)(0x1.1891270000000p+0, 0x1.b158893c45484p-25), + (double2)(0x1.18d8210000000p+0, 0x1.149ef0fc35826p-28), + (double2)(0x1.191ef60000000p+0, 0x1.f2e77ea96acaap-26), + (double2)(0x1.1965a80000000p+0, 0x1.200074c471a95p-26), + (double2)(0x1.19ac360000000p+0, 0x1.3f8cc517f6f04p-25), + (double2)(0x1.19f2a10000000p+0, 0x1.60ba2e311bb55p-25), + (double2)(0x1.1a38e90000000p+0, 0x1.4b788730bbec3p-25), + (double2)(0x1.1a7f0e0000000p+0, 0x1.57090795ee20cp-25), + (double2)(0x1.1ac5100000000p+0, 0x1.d9ffe983670b1p-25), + (double2)(0x1.1b0af00000000p+0, 0x1.2a463ff61bfdap-25), + (double2)(0x1.1b50ad0000000p+0, 0x1.9d1bc6a5e65cfp-25), + (double2)(0x1.1b96480000000p+0, 0x1.8718abaa9e922p-25), + (double2)(0x1.1bdbc10000000p+0, 0x1.3c2f52ffa342ep-25), + (double2)(0x1.1c21180000000p+0, 0x1.0fae13ff42c80p-25), + (double2)(0x1.1c664d0000000p+0, 0x1.5440f0ef00d57p-25), + (double2)(0x1.1cab610000000p+0, 0x1.6fcd22d4e3c1ep-27), + (double2)(0x1.1cf0530000000p+0, 0x1.e0c60b409e863p-27), + 
(double2)(0x1.1d35230000000p+0, 0x1.f9cab5a5f0333p-25), + (double2)(0x1.1d79d30000000p+0, 0x1.30f24744c333dp-25), + (double2)(0x1.1dbe620000000p+0, 0x1.b50622a76b2fep-27), + (double2)(0x1.1e02cf0000000p+0, 0x1.fdb94ba595375p-25), + (double2)(0x1.1e471d0000000p+0, 0x1.861b9b945a171p-28), + (double2)(0x1.1e8b490000000p+0, 0x1.54348015188c4p-25), + (double2)(0x1.1ecf550000000p+0, 0x1.b54d149865523p-25), + (double2)(0x1.1f13410000000p+0, 0x1.a0bb783d9de33p-25), + (double2)(0x1.1f570d0000000p+0, 0x1.629d12b1a2157p-25), + (double2)(0x1.1f9ab90000000p+0, 0x1.467fe35d179dfp-25), + (double2)(0x1.1fde450000000p+0, 0x1.9763f3e26c8f7p-25), + (double2)(0x1.2021b20000000p+0, 0x1.3f798bb9f7679p-26), + (double2)(0x1.2064ff0000000p+0, 0x1.52e577e855898p-26), + (double2)(0x1.20a82c0000000p+0, 0x1.fde47e5502c3ap-25), + (double2)(0x1.20eb3b0000000p+0, 0x1.cbd0b548d96a0p-26), + (double2)(0x1.212e2a0000000p+0, 0x1.a9cd9f7be8de8p-25), + (double2)(0x1.2170fb0000000p+0, 0x1.22bbe704886dep-26), + (double2)(0x1.21b3ac0000000p+0, 0x1.e3dea8317f020p-25), + (double2)(0x1.21f63f0000000p+0, 0x1.e812085ac8855p-25), + (double2)(0x1.2238b40000000p+0, 0x1.c87144f24cb07p-26), + (double2)(0x1.227b0a0000000p+0, 0x1.1e128ee311fa2p-25), + (double2)(0x1.22bd420000000p+0, 0x1.b5c163d61a2d3p-26), + (double2)(0x1.22ff5c0000000p+0, 0x1.7d97e7fb90633p-27), + (double2)(0x1.2341570000000p+0, 0x1.efe899d50f6a7p-25), + (double2)(0x1.2383350000000p+0, 0x1.d0333eb75de5ap-25), + (double2)(0x1.23c4f60000000p+0, 0x1.0e590be73a573p-27), + (double2)(0x1.2406980000000p+0, 0x1.8ce8dcac3cdd2p-25), + (double2)(0x1.24481d0000000p+0, 0x1.ee8a48954064bp-25), + (double2)(0x1.2489850000000p+0, 0x1.aa62f18461e09p-25), + (double2)(0x1.24cad00000000p+0, 0x1.01e5940986a15p-25), + (double2)(0x1.250bfe0000000p+0, 0x1.b082f4f9b8d4cp-28), + (double2)(0x1.254d0e0000000p+0, 0x1.876e0e5527f5ap-25), + (double2)(0x1.258e020000000p+0, 0x1.3617080831e6bp-25), + (double2)(0x1.25ced90000000p+0, 0x1.81b26e34aa4a2p-25), + 
(double2)(0x1.260f940000000p+0, 0x1.52ee66dfab0c1p-26), + (double2)(0x1.2650320000000p+0, 0x1.d85a5329e8819p-26), + (double2)(0x1.2690b40000000p+0, 0x1.105c1b646b5d1p-26), + (double2)(0x1.26d1190000000p+0, 0x1.bb6690c1a379cp-25), + (double2)(0x1.2711630000000p+0, 0x1.86aeba73ce3a9p-26), + (double2)(0x1.2751900000000p+0, 0x1.dd16198294dd4p-25), + (double2)(0x1.2791a20000000p+0, 0x1.454e675775e83p-25), + (double2)(0x1.27d1980000000p+0, 0x1.3842e026197eap-25), + (double2)(0x1.2811720000000p+0, 0x1.f1ce0e70c44d2p-25), + (double2)(0x1.2851310000000p+0, 0x1.ad636441a5627p-25), + (double2)(0x1.2890d50000000p+0, 0x1.4c205d7212abbp-26), + (double2)(0x1.28d05d0000000p+0, 0x1.167c86c116419p-25), + (double2)(0x1.290fca0000000p+0, 0x1.38ec3ef16e294p-25), + (double2)(0x1.294f1c0000000p+0, 0x1.473fceace9321p-25), + (double2)(0x1.298e530000000p+0, 0x1.7af53a836dba7p-25), + (double2)(0x1.29cd700000000p+0, 0x1.a51f3c383b652p-30), + (double2)(0x1.2a0c710000000p+0, 0x1.3696da190822dp-25), + (double2)(0x1.2a4b580000000p+0, 0x1.2f9adec77074bp-25), + (double2)(0x1.2a8a250000000p+0, 0x1.8190fd5bee55fp-28), + (double2)(0x1.2ac8d70000000p+0, 0x1.bfee8fac68e55p-27), + (double2)(0x1.2b076f0000000p+0, 0x1.31c9d6bc5f68ap-28), + (double2)(0x1.2b45ec0000000p+0, 0x1.89d0523737edfp-25), + (double2)(0x1.2b84500000000p+0, 0x1.a295943bf47bbp-26), + (double2)(0x1.2bc29a0000000p+0, 0x1.96be32e5b3207p-28), + (double2)(0x1.2c00c90000000p+0, 0x1.e44c7d909fa0ep-25), + (double2)(0x1.2c3ee00000000p+0, 0x1.b2505da94d9eap-29), + (double2)(0x1.2c7cdc0000000p+0, 0x1.0c851f46c9c98p-25), + (double2)(0x1.2cbabf0000000p+0, 0x1.da71f7d9aa3b7p-26), + (double2)(0x1.2cf8880000000p+0, 0x1.f1b605d019ef1p-25), + (double2)(0x1.2d36390000000p+0, 0x1.386e8a2189563p-27), + (double2)(0x1.2d73d00000000p+0, 0x1.b19fa5d306ba7p-28), + (double2)(0x1.2db14d0000000p+0, 0x1.dd749b67aef76p-25), + (double2)(0x1.2deeb20000000p+0, 0x1.76ff6f1dc04b0p-25), + (double2)(0x1.2e2bfe0000000p+0, 0x1.35a33d0b232a6p-25), + 
(double2)(0x1.2e69310000000p+0, 0x1.4bdc80024a4e1p-25), + (double2)(0x1.2ea64b0000000p+0, 0x1.ebd61770fd723p-25), + (double2)(0x1.2ee34d0000000p+0, 0x1.4769fc537264dp-25), + (double2)(0x1.2f20360000000p+0, 0x1.9021f429f3b98p-25), + (double2)(0x1.2f5d070000000p+0, 0x1.ee7083efbd606p-26), + (double2)(0x1.2f99bf0000000p+0, 0x1.ad985552a6b1ap-25), + (double2)(0x1.2fd65f0000000p+0, 0x1.e3df778772160p-25), + (double2)(0x1.3012e70000000p+0, 0x1.ca5d76ddc9b34p-25), + (double2)(0x1.304f570000000p+0, 0x1.91154ffdbaf74p-25), + (double2)(0x1.308baf0000000p+0, 0x1.67bdd57fb306ap-25), + (double2)(0x1.30c7ef0000000p+0, 0x1.7dc255ac40886p-25), + (double2)(0x1.3104180000000p+0, 0x1.219f38e8afafep-32), + (double2)(0x1.3140280000000p+0, 0x1.2416bf9669a04p-25), + (double2)(0x1.317c210000000p+0, 0x1.11c96b2b3987fp-25), + (double2)(0x1.31b8020000000p+0, 0x1.f99ed447e1177p-25), + (double2)(0x1.31f3cd0000000p+0, 0x1.3245826328a11p-30), + (double2)(0x1.322f7f0000000p+0, 0x1.6f56dd1e645f8p-25), + (double2)(0x1.326b1b0000000p+0, 0x1.6164946945535p-27), + (double2)(0x1.32a69f0000000p+0, 0x1.e37d59d190028p-26), + (double2)(0x1.32e20c0000000p+0, 0x1.68671f12bf828p-25), + (double2)(0x1.331d620000000p+0, 0x1.e8ecbca6aabbdp-25), + (double2)(0x1.3358a20000000p+0, 0x1.3f49e109a5912p-26), + (double2)(0x1.3393ca0000000p+0, 0x1.b8a0e11ec3043p-25), + (double2)(0x1.33cedc0000000p+0, 0x1.5fae00aed691ap-25), + (double2)(0x1.3409d70000000p+0, 0x1.c0569bece3e4ap-25), + (double2)(0x1.3444bc0000000p+0, 0x1.05e26744efbfep-25), + (double2)(0x1.347f8a0000000p+0, 0x1.5b570a94be5c5p-25), + (double2)(0x1.34ba420000000p+0, 0x1.d6f156ea0e063p-26), + (double2)(0x1.34f4e30000000p+0, 0x1.e0ca7612fc484p-25), + (double2)(0x1.352f6f0000000p+0, 0x1.963c927b25258p-27), + (double2)(0x1.3569e40000000p+0, 0x1.47930aa725a5cp-26), + (double2)(0x1.35a4430000000p+0, 0x1.8a79fe3af43b3p-26), + (double2)(0x1.35de8c0000000p+0, 0x1.e6dc29c41bdafp-26), + (double2)(0x1.3618bf0000000p+0, 0x1.57a2e76f863a5p-25), + 
(double2)(0x1.3652dd0000000p+0, 0x1.ae3b61716354dp-29), + (double2)(0x1.368ce40000000p+0, 0x1.65fb5df6906b1p-25), + (double2)(0x1.36c6d60000000p+0, 0x1.6177d7f588f7bp-25), + (double2)(0x1.3700b30000000p+0, 0x1.ad55abd091b67p-28), + (double2)(0x1.373a7a0000000p+0, 0x1.55337b2422d76p-30), + (double2)(0x1.37742b0000000p+0, 0x1.084ebe86972d5p-25), + (double2)(0x1.37adc70000000p+0, 0x1.56395808e1ea3p-25), + (double2)(0x1.37e74e0000000p+0, 0x1.1bce21b40fba7p-25), + (double2)(0x1.3820c00000000p+0, 0x1.006f94605b515p-26), + (double2)(0x1.385a1c0000000p+0, 0x1.aa676aceb1f7dp-25), + (double2)(0x1.3893640000000p+0, 0x1.8229f76554ce6p-26), + (double2)(0x1.38cc960000000p+0, 0x1.eabfc6cf57330p-25), + (double2)(0x1.3905b40000000p+0, 0x1.4daed9c0ce8bcp-25), + (double2)(0x1.393ebd0000000p+0, 0x1.0ff1768237141p-25), + (double2)(0x1.3977b10000000p+0, 0x1.575f83051b085p-25), + (double2)(0x1.39b0910000000p+0, 0x1.2667deb523e29p-27), + (double2)(0x1.39e95c0000000p+0, 0x1.816996954f4fdp-30), + (double2)(0x1.3a22120000000p+0, 0x1.87cfccf4d9cd4p-26), + (double2)(0x1.3a5ab40000000p+0, 0x1.2c5d018198353p-26), + (double2)(0x1.3a93410000000p+0, 0x1.a7a898dcc34aap-25), + (double2)(0x1.3acbbb0000000p+0, 0x1.cead6dadc36d1p-29), + (double2)(0x1.3b04200000000p+0, 0x1.a55759c498bdfp-29), + (double2)(0x1.3b3c700000000p+0, 0x1.c414a9ef6de04p-25), + (double2)(0x1.3b74ad0000000p+0, 0x1.3e2108a6e58fap-25), + (double2)(0x1.3bacd60000000p+0, 0x1.587fd7643d77cp-26), + (double2)(0x1.3be4eb0000000p+0, 0x1.901eb1d3ff3dfp-28), + (double2)(0x1.3c1ceb0000000p+0, 0x1.f2ccd7c812fc6p-25), + (double2)(0x1.3c54d90000000p+0, 0x1.1c8ee70a01049p-29), + (double2)(0x1.3c8cb20000000p+0, 0x1.63e8d02831eecp-26), + (double2)(0x1.3cc4770000000p+0, 0x1.f61a42a92c7ffp-25), + (double2)(0x1.3cfc2a0000000p+0, 0x1.a917399c84d24p-34), + (double2)(0x1.3d33c80000000p+0, 0x1.e9197c8eec2f0p-26), + (double2)(0x1.3d6b530000000p+0, 0x1.e6f842f5a1378p-26), + (double2)(0x1.3da2cb0000000p+0, 0x1.fac242a90a0fcp-29), + 
(double2)(0x1.3dda2f0000000p+0, 0x1.35ed726610227p-26), + (double2)(0x1.3e11800000000p+0, 0x1.0e0d64804b15bp-26), + (double2)(0x1.3e48be0000000p+0, 0x1.560675daba814p-31), + (double2)(0x1.3e7fe80000000p+0, 0x1.37388c8768032p-25), + (double2)(0x1.3eb7000000000p+0, 0x1.ee3c89f9e01f5p-28), + (double2)(0x1.3eee040000000p+0, 0x1.39f6f0d09747cp-25), + (double2)(0x1.3f24f60000000p+0, 0x1.322c327abb8f0p-27), + (double2)(0x1.3f5bd40000000p+0, 0x1.961b347c8ac80p-25), + (double2)(0x1.3f92a00000000p+0, 0x1.3711fbbd0f118p-25), + (double2)(0x1.3fc9590000000p+0, 0x1.4fad8d7718ffbp-25), + (double2)(0x1.3fffff0000000p+0, 0x1.fffffffffffffp-25), + (double2)(0x1.4036930000000p+0, 0x1.67efa79ec35b4p-25), + (double2)(0x1.406d140000000p+0, 0x1.a737687a254a8p-25), + (double2)(0x1.40a3830000000p+0, 0x1.bace0f87d924dp-26), + (double2)(0x1.40d9df0000000p+0, 0x1.29e37c237e392p-25), + (double2)(0x1.4110290000000p+0, 0x1.57ce7ac3f3012p-26), + (double2)(0x1.4146600000000p+0, 0x1.82829359f8fbdp-25), + (double2)(0x1.417c850000000p+0, 0x1.cc9be42d14676p-25), + (double2)(0x1.41b2980000000p+0, 0x1.a8f001c137d0bp-25), + (double2)(0x1.41e8990000000p+0, 0x1.36127687dda05p-25), + (double2)(0x1.421e880000000p+0, 0x1.24dba322646f0p-26), + (double2)(0x1.4254640000000p+0, 0x1.dc43f1ed210b4p-25), + (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25), +) + +DECLARE_TABLE(double2, CBRT_TBL_REM, 5, + (double2)(0x1.428a2f0000000p-1, 0x1.31ae515c447bbp-26), + (double2)(0x1.965fea0000000p-1, 0x1.4f5b8f20ac166p-27), + (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25), + (double2)(0x1.965fea0000000p+0, 0x1.4f5b8f20ac166p-26), +) +
diff --git a/amd-builtins/math64/ceilD.cl b/amd-builtins/math64/ceilD.cl new file mode 100644 index 0000000..272d589 --- /dev/null +++ b/amd-builtins/math64/ceilD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// OpenCL ceil(double): thin wrapper over the AMDIL backend intrinsic.
// The intrinsic's name suggests round-toward-positive-infinity, which is
// exactly the ceil contract — NOTE(review): intrinsic semantics are defined
// by the backend, not visible here; confirm against the AMDIL spec.
__attribute__ ((overloadable, always_inline)) double
ceil(double x)
{
    return __amdil_round_posinf_f64(x);
}
diff --git a/amd-builtins/math64/copysignD.cl b/amd-builtins/math64/copysignD.cl new file mode 100644 index 0000000..818e5f7 --- /dev/null +++ b/amd-builtins/math64/copysignD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// __hsail_ intrinsic which has no __amdil_ equivalent.
extern __attribute__((pure)) double __hsail_copysign_f64(double, double);

// OpenCL copysign(double, double): returns a value with the magnitude of x
// and the sign of y, delegated entirely to the HSAIL backend intrinsic
// declared above.
__attribute__((overloadable, always_inline)) double
copysign(double x, double y)
{
    return __hsail_copysign_f64(x, y);
}
diff --git a/amd-builtins/math64/cosD.cl b/amd-builtins/math64/cosD.cl new file mode 100644 index 0000000..74cabec --- /dev/null +++ b/amd-builtins/math64/cosD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"
#include "sincosD_piby4.h"
#include "remainderD_piby2.h"

// OpenCL cos(double).  cos is even, so work with |x|; reduce the argument
// modulo pi/2 (fast path below 2^47, Payne-Hanek-style path above), then
// evaluate sin/cos of the reduced argument and pick/negate by quadrant.
__attribute__((overloadable, always_inline, pure, weak)) double
cos(double x)
{
    x = fabs(x);

    // r + rr = |x| reduced to [-pi/4, pi/4] head/tail; regn = quadrant mod 4.
    double r, rr;
    int regn;

    if (x < 0x1.0p+47)
        remainder_piby2_medium(x, &r, &rr, &regn);
    else
        remainder_piby2_large(x, &r, &rr, &regn);

    // sc = (sin(r+rr), cos(r+rr)) as (lo, hi); negate the sine component
    // so quadrant selection below yields cosine directly.
    double2 sc = sincos_piby4(r, rr);
    sc.lo = -sc.lo;

    // Odd quadrants take the (negated) sine, even quadrants the cosine;
    // quadrants 2 and 3 additionally flip the sign bit of the result.
    int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
    c.hi ^= (regn > 1) << 31;

    // NaN/Inf inputs produce a quiet NaN.
    return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
}
diff --git a/amd-builtins/math64/coshD.cl b/amd-builtins/math64/coshD.cl new file mode 100644 index 0000000..bc4a4fa --- /dev/null +++ b/amd-builtins/math64/coshD.cl
@@ -0,0 +1,111 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +__attribute__((overloadable)) double +cosh(double x) +{ + USE_TABLE(double2, sinh_tbl, SINH_TBL); + USE_TABLE(double2, cosh_tbl, COSH_TBL); + + // After dealing with special cases the computation is split into + // regions as follows (cosh is even, so the result never carries sign(x)): + // + // abs(x) >= max_cosh_arg: + // cosh(x) = Inf + // + // abs(x) >= small_threshold: + // cosh(x) = exp(abs(x))/2 computed using the + // splitexp and scaleDouble functions as for exp_amd(). + // + // abs(x) < small_threshold: + // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + // cosh(x) is then z. 
*/ + + // This is ln(2^1025) + const double max_cosh_arg = 7.10475860073943977113e+02; /* 0x408633ce8fb9f87e */ + + // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27) + const double small_threshold = 0x1.2b708872320e2p+4; + + double y = fabs(x); + + // In this range we find the integer part y0 of y + // and the increment dy = y - y0. We then compute + // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) + // where sinh(y0) and cosh(y0) are tabulated above. + + int ind = min((int)y, 36); + double dy = y - ind; + double dy2 = dy * dy; + + double sdy = dy * dy2 * + fma(dy2, + fma(dy2, + fma(dy2, + fma(dy2, + fma(dy2, + fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9), + 0.250521176994133472333666e-7), + 0.275573191913636406057211e-5), + 0.198412698413242405162014e-3), + 0.833333333333329931873097e-2), + 0.166666666666666667013899e0); + + double cdy = dy2 * fma(dy2, + fma(dy2, + fma(dy2, + fma(dy2, + fma(dy2, + fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8), + 0.275573350756016588011357e-6), + 0.248015872460622433115785e-4), + 0.138888888889814854814536e-2), + 0.416666666666660876512776e-1), + 0.500000000000000005911074e0); + + // At this point sinh(dy) is approximated by dy + sdy, + // and cosh(dy) is approximated by 1 + cdy. + double2 tv = cosh_tbl[ind]; + double cl = tv.s0; + double ct = tv.s1; + tv = sinh_tbl[ind]; + double sl = tv.s0; + double st = tv.s1; + + double z = fma(sl, dy, fma(sl, sdy, fma(cl, cdy, fma(st, dy, fma(st, sdy, ct*cdy)) + ct))) + cl; + + // Other cases + z = y < 0x1.0p-28 ? 1.0 : z; + + double t = exp(y - 0x1.62e42fefa3800p-1); + t = fma(t, -0x1.ef35793c76641p-45, t); + z = y >= small_threshold ? t : z; + + z = y >= max_cosh_arg ? as_double(PINFBITPATT_DP64) : z; + + z = isinf(x) | isnan(x) ? y : z; + + return z; +} +
diff --git a/amd-builtins/math64/cospiD.cl b/amd-builtins/math64/cospiD.cl new file mode 100644 index 0000000..0297cb0 --- /dev/null +++ b/amd-builtins/math64/cospiD.cl
@@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" +#include "sincosD_piby4.h" + +__attribute__((overloadable)) double +cospi(double x) +{ + const double pi = 3.1415926535897932384626433832795; + + long ix = as_long(x) & 0x7fffffffffffffffL; + double ax = as_double(ix); + long iax = (long)ax; + double r = ax - (double)iax; + long xodd = iax & 0x1L ? 0x8000000000000000L : 0L; + + // Initialize with return for +-Inf and NaN + long ir = 0x7ff8000000000000L; + + // 2^53 <= |x| < Inf, the result is always even integer + ir = ix < 0x7ff0000000000000 ? 0x3ff0000000000000L : ir; + + // 2^52 <= |x| < 2^53, the result is always integer + ir = ax < 0x1.0p+53 ? 
xodd | 0x3ff0000000000000L : ir; + + // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval + + // r < 1.0 + double a = 1.0 - r; + int e = 1; + long s = xodd ^ 0x8000000000000000L; + + // r <= 0.75 + int c = r <= 0.75; + double t = r - 0.5; + a = c ? t : a; + e = c ? 0 : e; + + // r < 0.5 + c = r < 0.5; + t = 0.5 - r; + a = c ? t : a; + s = c ? xodd : s; + + // r <= 0.25 + c = r <= 0.25; + a = c ? r : a; + e = c ? 1 : e; + + double2 sc = sincos_piby4(a * pi, 0.0); + long jr = s ^ as_long(e ? sc.hi : sc.lo); + + ir = ax < 0x1.0p+52 ? jr : ir; + + return as_double(ir); +} +
diff --git a/amd-builtins/math64/ep_logD.h b/amd-builtins/math64/ep_logD.h new file mode 100644 index 0000000..1ba9b15 --- /dev/null +++ b/amd-builtins/math64/ep_logD.h
@@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define LN0 8.33333333333317923934e-02 +#define LN1 1.25000000037717509602e-02 +#define LN2 2.23213998791944806202e-03 +#define LN3 4.34887777707614552256e-04 + +#define LF0 8.33333333333333593622e-02 +#define LF1 1.24999999978138668903e-02 +#define LF2 2.23219810758559851206e-03 + +static inline void +ep_log(double x, int *xexp, double *r1, double *r2) +{ + USE_TABLE(double2, p_tbl, LN_TBL); + + // Computes natural log(x). 
Algorithm based on: + // Ping-Tak Peter Tang + // "Table-driven implementation of the logarithm function in IEEE + // floating-point arithmetic" + // ACM Transactions on Mathematical Software (TOMS) + // Volume 16, Issue 4 (December 1990) + int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0; + + ulong ux = as_ulong(x); + ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); + int c = ux < IMPBIT_DP64; + ux = c ? uxs : ux; + int expadjust = c ? 60 : 0; + + // Store the exponent of x in xexp and put f into the range [0.5,1) + int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; + double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); + *xexp = near_one ? 0 : xexp1; + + double r = x - 1.0; + double u1 = MATH_DIVIDE(r, 2.0 + r); + double ru1 = -r * u1; + u1 = u1 + u1; + + int index = as_int2(ux).hi >> 13; + index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); + + double f1 = index * 0x1.0p-7; + double f2 = f - f1; + double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1)); + + double2 tv = p_tbl[index - 64]; + double z1 = tv.s0; + double q = tv.s1; + + z1 = near_one ? r : z1; + q = near_one ? 0.0 : q; + double u = near_one ? u1 : u2; + double v = u*u; + + double cc = near_one ? ru1 : u2; + + double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0); + double z22 = fma(v, fma(v, LF2, LF1), LF0); + double z2 = near_one ? z21 : z22; + z2 = fma(u*v, z2, cc) + q; + + *r1 = z1; + *r2 = z2; +} +
diff --git a/amd-builtins/math64/erfD.cl b/amd-builtins/math64/erfD.cl new file mode 100644 index 0000000..1efa936 --- /dev/null +++ b/amd-builtins/math64/erfD.cl
@@ -0,0 +1,251 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* double erf(double x) + * double erfc(double x) + * x + * 2 |\ + * erf(x) = --------- | exp(-t*t)dt + * sqrt(pi) \| + * 0 + * + * erfc(x) = 1-erf(x) + * Note that + * erf(-x) = -erf(x) + * erfc(-x) = 2 - erfc(x) + * + * Method: + * 1. 
For |x| in [0, 0.84375] + * erf(x) = x + x*R(x^2) + * erfc(x) = 1 - erf(x) if x in [-.84375,0.25] + * = 0.5 + ((0.5-x)-x*R) if x in [0.25,0.84375] + * where R = P/Q where P is an odd poly of degree 8 and + * Q is an odd poly of degree 10. + * -57.90 + * | R - (erf(x)-x)/x | <= 2 + * + * + * Remark. The formula is derived by noting + * erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....) + * and that + * 2/sqrt(pi) = 1.128379167095512573896158903121545171688 + * is close to one. The interval is chosen because the fix + * point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is + * near 0.6174), and by some experiment, 0.84375 is chosen to + * guarantee the error is less than one ulp for erf. + * + * 2. For |x| in [0.84375,1.25], let s = |x| - 1, and + * c = 0.84506291151 rounded to single (24 bits) + * erf(x) = sign(x) * (c + P1(s)/Q1(s)) + * erfc(x) = (1-c) - P1(s)/Q1(s) if x > 0 + * 1+(c+P1(s)/Q1(s)) if x < 0 + * |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06 + * Remark: here we use the taylor series expansion at x=1. + * erf(1+s) = erf(1) + s*Poly(s) + * = 0.845.. + P1(s)/Q1(s) + * That is, we use rational approximation to approximate + * erf(1+s) - (c = (single)0.84506291151) + * Note that |P1/Q1|< 0.078 for x in [0.84375,1.25] + * where + * P1(s) = degree 6 poly in s + * Q1(s) = degree 6 poly in s + * + * 3. For x in [1.25,1/0.35(~2.857143)], + * erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1) + * erf(x) = 1 - erfc(x) + * where + * R1(z) = degree 7 poly in z, (z=1/x^2) + * S1(z) = degree 8 poly in z + * + * 4. 
For x in [1/0.35,28] + * erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0 + * = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0 + * = 2.0 - tiny (if x <= -6) + * erf(x) = sign(x)*(1.0 - erfc(x)) if x < 6, else + * erf(x) = sign(x)*(1.0 - tiny) + * where + * R2(z) = degree 6 poly in z, (z=1/x^2) + * S2(z) = degree 7 poly in z + * + * Note1: + * To compute exp(-x*x-0.5625+R/S), let s be a single + * precision number and s := x; then + * -x*x = -s*s + (s-x)*(s+x) + * exp(-x*x-0.5626+R/S) = + * exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S); + * Note2: + * Here 4 and 5 make use of the asymptotic series + * exp(-x*x) + * erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) ) + * x*sqrt(pi) + * We use rational approximation to approximate + * g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625 + * Here is the error bound for R1/S1 and R2/S2 + * |R1/S1 - f(x)| < 2**(-62.57) + * |R2/S2 - f(x)| < 2**(-61.52) + * + * 5. For inf > x >= 28 + * erf(x) = sign(x) *(1 - tiny) (raise inexact) + * erfc(x) = tiny*tiny (raise underflow) if x > 0 + * = 2 - tiny if x<0 + * + * 7. 
Special case: + * erf(0) = 0, erf(inf) = 1, erf(-inf) = -1, + * erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2, + * erfc/erf(NaN) is NaN + */ + +#define AU0 -9.86494292470009928597e-03 +#define AU1 -7.99283237680523006574e-01 +#define AU2 -1.77579549177547519889e+01 +#define AU3 -1.60636384855821916062e+02 +#define AU4 -6.37566443368389627722e+02 +#define AU5 -1.02509513161107724954e+03 +#define AU6 -4.83519191608651397019e+02 + +#define AV1 3.03380607434824582924e+01 +#define AV2 3.25792512996573918826e+02 +#define AV3 1.53672958608443695994e+03 +#define AV4 3.19985821950859553908e+03 +#define AV5 2.55305040643316442583e+03 +#define AV6 4.74528541206955367215e+02 +#define AV7 -2.24409524465858183362e+01 + +#define BU0 -9.86494403484714822705e-03 +#define BU1 -6.93858572707181764372e-01 +#define BU2 -1.05586262253232909814e+01 +#define BU3 -6.23753324503260060396e+01 +#define BU4 -1.62396669462573470355e+02 +#define BU5 -1.84605092906711035994e+02 +#define BU6 -8.12874355063065934246e+01 +#define BU7 -9.81432934416914548592e+00 + +#define BV1 1.96512716674392571292e+01 +#define BV2 1.37657754143519042600e+02 +#define BV3 4.34565877475229228821e+02 +#define BV4 6.45387271733267880336e+02 +#define BV5 4.29008140027567833386e+02 +#define BV6 1.08635005541779435134e+02 +#define BV7 6.57024977031928170135e+00 +#define BV8 -6.04244152148580987438e-02 + +#define CU0 -2.36211856075265944077e-03 +#define CU1 4.14856118683748331666e-01 +#define CU2 -3.72207876035701323847e-01 +#define CU3 3.18346619901161753674e-01 +#define CU4 -1.10894694282396677476e-01 +#define CU5 3.54783043256182359371e-02 +#define CU6 -2.16637559486879084300e-03 + +#define CV1 1.06420880400844228286e-01 +#define CV2 5.40397917702171048937e-01 +#define CV3 7.18286544141962662868e-02 +#define CV4 1.26171219808761642112e-01 +#define CV5 1.36370839120290507362e-02 +#define CV6 1.19844998467991074170e-02 + +#define DU0 1.28379167095512558561e-01 +#define DU1 -3.25042107247001499370e-01 +#define DU2 
-2.84817495755985104766e-02 +#define DU3 -5.77027029648944159157e-03 +#define DU4 -2.37630166566501626084e-05 + +#define DV1 3.97917223959155352819e-01 +#define DV2 6.50222499887672944485e-02 +#define DV3 5.08130628187576562776e-03 +#define DV4 1.32494738004321644526e-04 +#define DV5 -3.96022827877536812320e-06 + +__attribute__((overloadable)) double +erf(double y) +{ + double x = fabs(y); + double x2 = x * x; + double xm1 = x - 1.0; + + // Poly variable + double t = 1.0 / x2; + t = x < 1.25 ? xm1 : t; + t = x < 0.84375 ? x2 : t; + + double u, ut, v, vt; + + // Evaluate rational poly + // XXX We need to see of we can grab 16 coefficents from a table + // faster than evaluating 3 of the poly pairs + // if (x < 6.0) + u = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AU6, AU5), AU4), AU3), AU2), AU1), AU0); + v = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AV7, AV6), AV5), AV4), AV3), AV2), AV1); + + ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0); + vt = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BV8, BV7), BV6), BV5), BV4), BV3), BV2), BV1); + u = x < 0x1.6db6ep+1 ? ut : u; + v = x < 0x1.6db6ep+1 ? vt : v; + + ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, CU6, CU5), CU4), CU3), CU2), CU1), CU0); + vt = fma(t, fma(t, fma(t, fma(t, fma(t, CV6, CV5), CV4), CV3), CV2), CV1); + u = x < 1.25 ? ut : u; + v = x < 1.25 ? vt : v; + + ut = fma(t, fma(t, fma(t, fma(t, DU4, DU3), DU2), DU1), DU0); + vt = fma(t, fma(t, fma(t, fma(t, DV5, DV4), DV3), DV2), DV1); + u = x < 0.84375 ? ut : u; + v = x < 0.84375 ? vt : v; + + v = fma(t, v, 1.0); + + // Compute rational approximation + double q = u / v; + + // Compute results + double z = as_double(as_long(x) & 0xffffffff00000000L); + double r = exp(-z * z - 0.5625) * exp((z - x) * (z + x) + q); + r = 1.0 - r / x; + + double ret = x < 6.0 ? r : 1.0; + + r = 8.45062911510467529297e-01 + q; + ret = x < 1.25 ? r : ret; + + q = x < 0x1.0p-28 ? 
1.28379167095512586316e-01 : q; + + r = fma(x, q, x); + ret = x < 0.84375 ? r : ret; + + ret = isnan(x) ? x : ret; + + return y < 0.0 ? -ret : ret; +} +
diff --git a/amd-builtins/math64/erfcD.cl b/amd-builtins/math64/erfcD.cl new file mode 100644 index 0000000..5321224 --- /dev/null +++ b/amd-builtins/math64/erfcD.cl
@@ -0,0 +1,261 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* double erf(double x) + * double erfc(double x) + * x + * 2 |\ + * erf(x) = --------- | exp(-t*t)dt + * sqrt(pi) \| + * 0 + * + * erfc(x) = 1-erf(x) + * Note that + * erf(-x) = -erf(x) + * erfc(-x) = 2 - erfc(x) + * + * Method: + * 1. 
For |x| in [0, 0.84375] + * erf(x) = x + x*R(x^2) + * erfc(x) = 1 - erf(x) if x in [-.84375,0.25] + * = 0.5 + ((0.5-x)-x*R) if x in [0.25,0.84375] + * where R = P/Q where P is an odd poly of degree 8 and + * Q is an odd poly of degree 10. + * -57.90 + * | R - (erf(x)-x)/x | <= 2 + * + * + * Remark. The formula is derived by noting + * erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....) + * and that + * 2/sqrt(pi) = 1.128379167095512573896158903121545171688 + * is close to one. The interval is chosen because the fix + * point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is + * near 0.6174), and by some experiment, 0.84375 is chosen to + * guarantee the error is less than one ulp for erf. + * + * 2. For |x| in [0.84375,1.25], let s = |x| - 1, and + * c = 0.84506291151 rounded to single (24 bits) + * erf(x) = sign(x) * (c + P1(s)/Q1(s)) + * erfc(x) = (1-c) - P1(s)/Q1(s) if x > 0 + * 1+(c+P1(s)/Q1(s)) if x < 0 + * |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06 + * Remark: here we use the taylor series expansion at x=1. + * erf(1+s) = erf(1) + s*Poly(s) + * = 0.845.. + P1(s)/Q1(s) + * That is, we use rational approximation to approximate + * erf(1+s) - (c = (single)0.84506291151) + * Note that |P1/Q1|< 0.078 for x in [0.84375,1.25] + * where + * P1(s) = degree 6 poly in s + * Q1(s) = degree 6 poly in s + * + * 3. For x in [1.25,1/0.35(~2.857143)], + * erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1) + * erf(x) = 1 - erfc(x) + * where + * R1(z) = degree 7 poly in z, (z=1/x^2) + * S1(z) = degree 8 poly in z + * + * 4. 
For x in [1/0.35,28] + * erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0 + * = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0 + * = 2.0 - tiny (if x <= -6) + * erf(x) = sign(x)*(1.0 - erfc(x)) if x < 6, else + * erf(x) = sign(x)*(1.0 - tiny) + * where + * R2(z) = degree 6 poly in z, (z=1/x^2) + * S2(z) = degree 7 poly in z + * + * Note1: + * To compute exp(-x*x-0.5625+R/S), let s be a single + * precision number and s := x; then + * -x*x = -s*s + (s-x)*(s+x) + * exp(-x*x-0.5626+R/S) = + * exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S); + * Note2: + * Here 4 and 5 make use of the asymptotic series + * exp(-x*x) + * erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) ) + * x*sqrt(pi) + * We use rational approximation to approximate + * g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625 + * Here is the error bound for R1/S1 and R2/S2 + * |R1/S1 - f(x)| < 2**(-62.57) + * |R2/S2 - f(x)| < 2**(-61.52) + * + * 5. For inf > x >= 28 + * erf(x) = sign(x) *(1 - tiny) (raise inexact) + * erfc(x) = tiny*tiny (raise underflow) if x > 0 + * = 2 - tiny if x<0 + * + * 7. 
Special case: + * erf(0) = 0, erf(inf) = 1, erf(-inf) = -1, + * erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2, + * erfc/erf(NaN) is NaN + */ + +#define AU0 -9.86494292470009928597e-03 +#define AU1 -7.99283237680523006574e-01 +#define AU2 -1.77579549177547519889e+01 +#define AU3 -1.60636384855821916062e+02 +#define AU4 -6.37566443368389627722e+02 +#define AU5 -1.02509513161107724954e+03 +#define AU6 -4.83519191608651397019e+02 + +#define AV0 3.03380607434824582924e+01 +#define AV1 3.25792512996573918826e+02 +#define AV2 1.53672958608443695994e+03 +#define AV3 3.19985821950859553908e+03 +#define AV4 2.55305040643316442583e+03 +#define AV5 4.74528541206955367215e+02 +#define AV6 -2.24409524465858183362e+01 + +#define BU0 -9.86494403484714822705e-03 +#define BU1 -6.93858572707181764372e-01 +#define BU2 -1.05586262253232909814e+01 +#define BU3 -6.23753324503260060396e+01 +#define BU4 -1.62396669462573470355e+02 +#define BU5 -1.84605092906711035994e+02 +#define BU6 -8.12874355063065934246e+01 +#define BU7 -9.81432934416914548592e+00 + +#define BV0 1.96512716674392571292e+01 +#define BV1 1.37657754143519042600e+02 +#define BV2 4.34565877475229228821e+02 +#define BV3 6.45387271733267880336e+02 +#define BV4 4.29008140027567833386e+02 +#define BV5 1.08635005541779435134e+02 +#define BV6 6.57024977031928170135e+00 +#define BV7 -6.04244152148580987438e-02 + +#define CU0 -2.36211856075265944077e-03 +#define CU1 4.14856118683748331666e-01 +#define CU2 -3.72207876035701323847e-01 +#define CU3 3.18346619901161753674e-01 +#define CU4 -1.10894694282396677476e-01 +#define CU5 3.54783043256182359371e-02 +#define CU6 -2.16637559486879084300e-03 + +#define CV0 1.06420880400844228286e-01 +#define CV1 5.40397917702171048937e-01 +#define CV2 7.18286544141962662868e-02 +#define CV3 1.26171219808761642112e-01 +#define CV4 1.36370839120290507362e-02 +#define CV5 1.19844998467991074170e-02 + +#define DU0 1.28379167095512558561e-01 +#define DU1 -3.25042107247001499370e-01 +#define DU2 
-2.84817495755985104766e-02 +#define DU3 -5.77027029648944159157e-03 +#define DU4 -2.37630166566501626084e-05 + +#define DV0 3.97917223959155352819e-01 +#define DV1 6.50222499887672944485e-02 +#define DV2 5.08130628187576562776e-03 +#define DV3 1.32494738004321644526e-04 +#define DV4 -3.96022827877536812320e-06 + +__attribute__((overloadable)) double +erfc(double x) +{ + long lx = as_long(x); + long ax = lx & 0x7fffffffffffffffL; + double absx = as_double(ax); + int xneg = lx != ax; + + // Poly arg + double x2 = x * x; + double xm1 = absx - 1.0; + double t = 1.0 / x2; + t = absx < 1.25 ? xm1 : t; + t = absx < 0.84375 ? x2 : t; + + + // Evaluate rational poly + // XXX Need to evaluate if we can grab the 14 coefficients from a + // table faster than evaluating 3 pairs of polys + double tu, tv, u, v; + + // |x| < 28 + u = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AU6, AU5), AU4), AU3), AU2), AU1), AU0); + v = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AV6, AV5), AV4), AV3), AV2), AV1), AV0); + + tu = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0); + tv = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BV7, BV6), BV5), BV4), BV3), BV2), BV1), BV0); + u = absx < 0x1.6db6dp+1 ? tu : u; + v = absx < 0x1.6db6dp+1 ? tv : v; + + tu = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, CU6, CU5), CU4), CU3), CU2), CU1), CU0); + tv = fma(t, fma(t, fma(t, fma(t, fma(t, CV5, CV4), CV3), CV2), CV1), CV0); + u = absx < 1.25 ? tu : u; + v = absx < 1.25 ? tv : v; + + tu = fma(t, fma(t, fma(t, fma(t, DU4, DU3), DU2), DU1), DU0); + tv = fma(t, fma(t, fma(t, fma(t, DV4, DV3), DV2), DV1), DV0); + u = absx < 0.84375 ? tu : u; + v = absx < 0.84375 ? tv : v; + + v = fma(t, v, 1.0); + double q = u / v; + + + // Evaluate return value + + // |x| < 28 + double z = as_double(ax & 0xffffffff00000000UL); + double ret = exp(-z * z - 0.5625) * exp((z - absx) * (z + absx) + q) / absx; + t = 2.0 - ret; + ret = xneg ? 
t : ret; + + const double erx = 8.45062911510467529297e-01; + z = erx + q + 1.0; + t = 1.0 - erx - q; + t = xneg ? z : t; + ret = absx < 1.25 ? t : ret; + + // z = 1.0 - fma(x, q, x); + // t = 0.5 - fma(x, q, x - 0.5); + // t = xneg == 1 | absx < 0.25 ? z : t; + t = fma(-x, q, 1.0 - x); + ret = absx < 0.84375 ? t : ret; + + ret = x >= 28.0 ? 0.0 : ret; + ret = x <= -6.0 ? 2.0 : ret; + ret = ax > 0x7ff0000000000000UL ? x : ret; + + return ret; +} +
diff --git a/amd-builtins/math64/exp10D.cl b/amd-builtins/math64/exp10D.cl new file mode 100644 index 0000000..b330b28 --- /dev/null +++ b/amd-builtins/math64/exp10D.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_EXP10 +#include "expD_base.h" +
diff --git a/amd-builtins/math64/exp2D.cl b/amd-builtins/math64/exp2D.cl new file mode 100644 index 0000000..938c594 --- /dev/null +++ b/amd-builtins/math64/exp2D.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_EXP2 +#include "expD_base.h" +
diff --git a/amd-builtins/math64/expD.cl b/amd-builtins/math64/expD.cl new file mode 100644 index 0000000..cf21877 --- /dev/null +++ b/amd-builtins/math64/expD.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_EXP +#include "expD_base.h" +
diff --git a/amd-builtins/math64/expD_base.h b/amd-builtins/math64/expD_base.h new file mode 100644 index 0000000..6cc2a6d --- /dev/null +++ b/amd-builtins/math64/expD_base.h
@@ -0,0 +1,139 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +// Algorithm: +// +// e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) +// +// x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer +// n = 64*m + j, 0 <= j < 64 +// +// e^x = 2^((64*m + j + f)/64) +// = (2^m) * (2^(j/64)) * 2^(f/64) +// = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) +// +// f = x*(64/ln(2)) - n +// r = f*(ln(2)/64) = x - n*(ln(2)/64) +// +// e^x = (2^m) * (2^(j/64)) * e^r +// +// (2^(j/64)) is precomputed +// +// e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! +// e^r = 1 + q +// +// q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 
//
// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )

// Table-driven exponential. The same body is compiled three times:
// as exp2() when COMPILING_EXP2 is defined, as exp10() when
// COMPILING_EXP10 is defined, and as exp() otherwise. Only the
// reduction constants differ between the three variants.
__attribute__((overloadable, always_inline, weak)) double
#if defined COMPILING_EXP2
exp2(double x)
#elif defined COMPILING_EXP10
exp10(double x)
#else
exp(double x)
#endif
{
    USE_TABLE(double2, p_tbl, TWO_TO_JBY64_EP);

    // Overflow / underflow thresholds for the final result.
#if defined(COMPILING_EXP2)
    const double X_MAX = 1024.0;
    const double X_MIN = -1074;
#elif defined(COMPILING_EXP10)
    const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10)
    const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10)
#else
    const double X_MAX = 0x1.62e42fefa39efp+9; // 1024*ln(2)
    const double X_MIN = -0x1.74910d52d3051p+9; // -1075*ln(2)
#endif

    // Reduction constants; the "head"/"tail" split of ln(2)/64 keeps the
    // reduced argument accurate beyond double precision.
#if defined(COMPILING_EXP2)
    const double R_64 = 64.0;
    const double R_1_BY_64 = 1.0 / 64.0;
    const double R_LN2 = 0x1.62e42fefa39efp-1; // ln(2)
#elif defined(COMPILING_EXP10)
    const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2)
    const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10))
    const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10))
    const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10)
#else
    const double R_64_BY_LOG2 = 0x1.71547652b82fep+6; // 64/ln(2)
    const double R_LOG2_BY_64_LD = 0x1.62e42fefa0000p-7; // head ln(2)/64
    const double R_LOG2_BY_64_TL = 0x1.cf79abc9e3b39p-46; // tail ln(2)/64
#endif

    // n = nearest integer to x * 64/ln(base); n = 64*m + j below.
#if defined(COMPILING_EXP2)
    int n = convert_int(x * R_64);
#elif defined(COMPILING_EXP10)
    int n = convert_int(x * R_64_BY_LOG10_2);
#else
    int n = convert_int(x * R_64_BY_LOG2);
#endif

    double dn = (double)n;

    int j = n & 0x3f;   // table index, 0 <= j < 64
    int m = n >> 6;     // power-of-two scale

    // Reduced argument r = x - n*ln(base)/64, computed with fma against
    // the head/tail constant split for extra precision.
#if defined(COMPILING_EXP2)
    double r = R_LN2 * fma(-R_1_BY_64, dn, x);
#elif defined(COMPILING_EXP10)
    double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
#else
    double r = fma(-R_LOG2_BY_64_TL, dn, fma(-R_LOG2_BY_64_LD, dn, x));
#endif

    // 6 term tail of Taylor expansion of e^r
    double z2 = r * fma(r,
                        fma(r,
                            fma(r,
                                fma(r,
                                    fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
                                    0x1.5555555555555p-5),
                                0x1.5555555555555p-3),
                            0x1.0000000000000p-1),
                        1.0);

    // Combine with tabulated 2^(j/64) (head tv.s0, tail tv.s1).
    double2 tv = p_tbl[j];
    z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;

    // Result would be subnormal: 2^m cannot be represented directly,
    // so apply the scale in two multiplications (z3) instead of ldexp.
    int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));

    int n1 = m >> 2;
    int n2 = m-n1;
    double z3= z2 * as_double(((long)n1 + 1023) << 52);
    z3 *= as_double(((long)n2 + 1023) << 52);

    z2 = ldexp(z2, m);
    z2 = small_value ? z3: z2;

    // Edge cases: propagate NaN, clamp to +inf above X_MAX, 0 below X_MIN.
    z2 = isnan(x) ? x : z2;

    z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
    z2 = x < X_MIN ? 0.0 : z2;

    return z2;
}
diff --git a/amd-builtins/math64/expD_table.h b/amd-builtins/math64/expD_table.h new file mode 100644 index 0000000..9909153 --- /dev/null +++ b/amd-builtins/math64/expD_table.h
@@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */

// 2^(j/64) for j = 0..63, stored as extra-precision (head, tail) pairs:
// .s0 holds the leading bits of 2^(j/64) and .s1 a small correction term
// so that .s0 + .s1 approximates 2^(j/64) beyond double precision.
DECLARE_TABLE(double2, TWO_TO_JBY64_EP, 64,
    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
    (double2)(0x1.02c9a30000000p+0, 0x1.cef00c1dcdef9p-25),
    (double2)(0x1.059b0d0000000p+0, 0x1.8ac2ba1d73e2ap-27),
    (double2)(0x1.0874510000000p+0, 0x1.0eb37901186bep-25),
    (double2)(0x1.0b55860000000p+0, 0x1.9f3121ec53172p-25),
    (double2)(0x1.0e3ec30000000p+0, 0x1.69e8d10103a17p-27),
    (double2)(0x1.11301d0000000p+0, 0x1.25b50a4ebbf1ap-32),
    (double2)(0x1.1429aa0000000p+0, 0x1.d525bbf668203p-25),
    (double2)(0x1.172b830000000p+0, 0x1.8faa2f5b9bef9p-25),
    (double2)(0x1.1a35be0000000p+0, 0x1.6df96ea796d31p-25),
    (double2)(0x1.1d48730000000p+0, 0x1.68b9aa7805b80p-28),
    (double2)(0x1.2063b80000000p+0, 0x1.0c519ac771dd6p-25),
    (double2)(0x1.2387a60000000p+0, 0x1.ceac470cd83f5p-25),
    (double2)(0x1.26b4560000000p+0, 0x1.789f37495e99cp-26),
    (double2)(0x1.29e9df0000000p+0, 0x1.47f7b84b09745p-26),
    (double2)(0x1.2d285a0000000p+0, 0x1.b900c2d002475p-26),
    (double2)(0x1.306fe00000000p+0, 0x1.4636e2a5bd1abp-25),
    (double2)(0x1.33c08b0000000p+0, 0x1.320b7fa64e430p-27),
    (double2)(0x1.371a730000000p+0, 0x1.ceaa72a9c5154p-26),
    (double2)(0x1.3a7db30000000p+0, 0x1.3967fdba86f24p-26),
    (double2)(0x1.3dea640000000p+0, 0x1.82468446b6824p-25),
    (double2)(0x1.4160a20000000p+0, 0x1.f72e29f84325bp-28),
    (double2)(0x1.44e0860000000p+0, 0x1.8624b40c4dbd0p-30),
    (double2)(0x1.486a2b0000000p+0, 0x1.704f3404f068ep-26),
    (double2)(0x1.4bfdad0000000p+0, 0x1.4d8a89c750e5ep-26),
    (double2)(0x1.4f9b270000000p+0, 0x1.a74b29ab4cf62p-26),
    (double2)(0x1.5342b50000000p+0, 0x1.a753e077c2a0fp-26),
    (double2)(0x1.56f4730000000p+0, 0x1.ad49f699bb2c0p-26),
    (double2)(0x1.5ab07d0000000p+0, 0x1.a90a852b19260p-25),
    (double2)(0x1.5e76f10000000p+0, 0x1.6b48521ba6f93p-26),
    (double2)(0x1.6247eb0000000p+0, 0x1.d2ac258f87d03p-31),
    (double2)(0x1.6623880000000p+0, 0x1.2a91124893ecfp-27),
    (double2)(0x1.6a09e60000000p+0, 0x1.9fcef32422cbep-26),
    (double2)(0x1.6dfb230000000p+0, 0x1.8ca345de441c5p-25),
    (double2)(0x1.71f75e0000000p+0, 0x1.1d8bee7ba46e1p-25),
    (double2)(0x1.75feb50000000p+0, 0x1.9099f22fdba6ap-26),
    (double2)(0x1.7a11470000000p+0, 0x1.f580c36bea881p-27),
    (double2)(0x1.7e2f330000000p+0, 0x1.b3d398841740ap-26),
    (double2)(0x1.8258990000000p+0, 0x1.2999c25159f11p-25),
    (double2)(0x1.868d990000000p+0, 0x1.68925d901c83bp-25),
    (double2)(0x1.8ace540000000p+0, 0x1.15506dadd3e2ap-27),
    (double2)(0x1.8f1ae90000000p+0, 0x1.22aee6c57304ep-25),
    (double2)(0x1.93737b0000000p+0, 0x1.9b8bc9e8a0387p-29),
    (double2)(0x1.97d8290000000p+0, 0x1.fbc9c9f173d24p-25),
    (double2)(0x1.9c49180000000p+0, 0x1.51f8480e3e235p-27),
    (double2)(0x1.a0c6670000000p+0, 0x1.6bbcac96535b5p-25),
    (double2)(0x1.a5503b0000000p+0, 0x1.1f12ae45a1224p-27),
    (double2)(0x1.a9e6b50000000p+0, 0x1.5e7f6fd0fac90p-26),
    (double2)(0x1.ae89f90000000p+0, 0x1.2b5a75abd0e69p-25),
    (double2)(0x1.b33a2b0000000p+0, 0x1.09e2bf5ed7fa1p-25),
    (double2)(0x1.b7f76f0000000p+0, 0x1.7daf237553d84p-27),
    (double2)(0x1.bcc1e90000000p+0, 0x1.2f074891ee83dp-30),
    (double2)(0x1.c199bd0000000p+0, 0x1.b0aa538444196p-25),
    (double2)(0x1.c67f120000000p+0, 0x1.cafa29694426fp-25),
    (double2)(0x1.cb720d0000000p+0, 0x1.9df20d22a0797p-25),
    (double2)(0x1.d072d40000000p+0, 0x1.40f12f71a1e45p-25),
    (double2)(0x1.d5818d0000000p+0, 0x1.9f7490e4bb40bp-25),
    (double2)(0x1.da9e600000000p+0, 0x1.ed9942b84600dp-27),
    (double2)(0x1.dfc9730000000p+0, 0x1.bdcdaf5cb4656p-27),
    (double2)(0x1.e502ee0000000p+0, 0x1.e2cffd89cf44cp-26),
    (double2)(0x1.ea4afa0000000p+0, 0x1.52486cc2c7b9dp-27),
    (double2)(0x1.efa1be0000000p+0, 0x1.cc2b44eee3fa4p-25),
    (double2)(0x1.f507650000000p+0, 0x1.6dc8a80ce9f09p-25),
    (double2)(0x1.fa7c180000000p+0, 0x1.9e90d82e90a7ep-28),
)
diff --git a/amd-builtins/math64/expm1D.cl b/amd-builtins/math64/expm1D.cl new file mode 100644 index 0000000..a61384d --- /dev/null +++ b/amd-builtins/math64/expm1D.cl
@@ -0,0 +1,114 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */
#include "math64.h"

// expm1(x) = e^x - 1, accurate even for x near 0 where the naive
// exp(x) - 1 would cancel catastrophically. Two paths: a direct
// polynomial for |x| small, and a table-driven exp-style reduction
// (shared TWO_TO_JBY64_EP table) otherwise.
__attribute__((overloadable)) double
expm1(double x)
{
    USE_TABLE(double2, p_tbl, TWO_TO_JBY64_EP);

    const double max_expm1_arg = 709.8;
    const double min_expm1_arg = -37.42994775023704;
    const double log_OnePlus_OneByFour = 0.22314355131420976; //0x3FCC8FF7C79A9A22 = log(1+1/4)
    const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4)
    const double sixtyfour_by_lnof2 = 92.33248261689366; //0x40571547652b82fe
    const double lnof2_by_64_head = 0.010830424696223417; //0x3f862e42fefa0000
    const double lnof2_by_64_tail = 2.5728046223276688e-14; //0x3d1cf79abc9e3b39


    // First, assume log(1-1/4) < x < log(1+1/4) i.e -0.28768 < x < 0.22314
    // Split x = u + v (u keeps the top bits) so x^2/2 can be formed as
    // u*u/2 + v*(x+u)/2 without rounding loss.
    double u = as_double(as_ulong(x) & 0xffffffffff000000UL);
    double v = x - u;
    double y = u * u * 0.5;
    double z = v * (x + u) * 0.5;

    // Polynomial for (e^x - 1 - x - x^2/2) / x^3, Horner form via fma.
    double q = fma(x,
                   fma(x,
                       fma(x,
                           fma(x,
                               fma(x,
                                   fma(x,
                                       fma(x,
                                           fma(x,2.4360682937111612e-8, 2.7582184028154370e-7),
                                           2.7558212415361945e-6),
                                       2.4801576918453420e-5),
                                   1.9841269447671544e-4),
                               1.3888888890687830e-3),
                           8.3333333334012270e-3),
                       4.1666666666665560e-2),
                   1.6666666666666632e-1);
    q *= x * x * x;

    // Two orderings of the sum; pick by the size of the x^2/2 term.
    double z1g = (u + y) + (q + (v + z));
    double z1 = x + (y + (q + z));
    z1 = y >= 0x1.0p-7 ? z1g : z1;

    // Now assume outside interval around 0
    // Standard reduction: x = (64*m + j)*ln(2)/64 + r.
    int n = (int)(x * sixtyfour_by_lnof2);
    int j = n & 0x3f;
    int m = n >> 6;

    double2 tv = p_tbl[j];
    double f1 = tv.s0;  // head of 2^(j/64)
    double f2 = tv.s1;  // tail of 2^(j/64)
    double f = f1 + f2;

    double dn = -n;
    double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x));

    // Polynomial for (e^r - 1)/r - 1, then q = e^r - 1.
    q = fma(r,
            fma(r,
                fma(r,
                    fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
                    4.16666666662260795726e-02),
                1.66666666665260878863e-01),
            5.00000000000000008883e-01);
    q = fma(r*r, q, r);

    double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64);   // 2^m
    double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64);  // 2^-m

    // Computations for m > 52, including where result is close to Inf
    ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2))));
    int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1;

    double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64));
    zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024;

    double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm));
    zmg52 = m == 1024 ? zme1024 : zmg52;

    // For m < 53
    double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q)));

    // For m < -7
    double zmln7 = fma(twopm, f1 + fma(f, q, f2), -1.0);

    // Select the path appropriate for m, then the near-zero path, then
    // clamp to +inf / -1 outside the representable argument range.
    // (Note: the relational operators yield 0/1 here, so bitwise & acts
    // as a logical AND.)
    z = m < 53 ? zml53 : zmg52;
    z = m < -7 ? zmln7 : z;
    z = x > log_OneMinus_OneByFour & x < log_OnePlus_OneByFour ? z1 : z;
    z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z;
    z = x < min_expm1_arg ? -1.0 : z;

    return z;
}
diff --git a/amd-builtins/math64/fabsD.cl b/amd-builtins/math64/fabsD.cl new file mode 100644 index 0000000..eded5e3 --- /dev/null +++ b/amd-builtins/math64/fabsD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Vector fabs overloads: each width-N overload splits the vector into
// its .lo/.hi halves and recurses down to the scalar case.
#define G(N) \
__attribute__((overloadable, always_inline)) double##N \
fabs(double##N x) \
{ \
    double##N ret; \
    ret.lo = fabs(x.lo); \
    ret.hi = fabs(x.hi); \
    return ret; \
}

G(16)
G(8)
G(4)

// double3 has no .lo/.hi halves of equal width, so it is split into a
// double2 part (.s01) and a scalar part (.s2) by hand.
__attribute__((overloadable, always_inline)) double3
fabs(double3 x)
{
    double3 ret;
    ret.s01 = fabs(x.s01);
    ret.s2 = fabs(x.s2);
    return ret;
}

G(2)

// Scalar case: forwarded to the AMDIL device builtin.
__attribute__((overloadable, always_inline)) double
fabs(double x)
{
    return __amdil_fabs_f64(x);
}
diff --git a/amd-builtins/math64/fdimD.cl b/amd-builtins/math64/fdimD.cl new file mode 100644 index 0000000..194e99c --- /dev/null +++ b/amd-builtins/math64/fdimD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// fdim(x, y): the positive difference, i.e. x - y when x > y, else +0,
// with a quiet NaN returned when either input is NaN. Implemented
// branch-free: OpenCL scalar relational operators return 0 or 1, so
// negating them yields all-zeros or all-ones masks.
__attribute__((overloadable, always_inline)) double
fdim(double x, double y)
{
    long n = -(isnan(x) | isnan(y)) & QNANBITPATT_DP64;  // NaN bits if either input is NaN
    long r = -(x > y) & as_long(x - y);                  // bits of x - y only when x > y
    return as_double(n | r);
}
diff --git a/amd-builtins/math64/floorD.cl b/amd-builtins/math64/floorD.cl new file mode 100644 index 0000000..2ac88fd --- /dev/null +++ b/amd-builtins/math64/floorD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// floor: round toward negative infinity; thin wrapper over the AMDIL
// round-to-neg-inf device builtin.
__attribute__ ((overloadable, always_inline)) double
floor(double x)
{
    return __amdil_round_neginf_f64(x);
}
diff --git a/amd-builtins/math64/fmaD.cl b/amd-builtins/math64/fmaD.cl new file mode 100644 index 0000000..8d7b8c8 --- /dev/null +++ b/amd-builtins/math64/fmaD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// fma(x, y, z) = x*y + z with a single rounding; thin wrapper over the
// AMDIL fused multiply-add device builtin.
__attribute__((overloadable, always_inline)) double
fma(double x, double y, double z)
{
    return __amdil_fma_f64(x, y, z);
}
diff --git a/amd-builtins/math64/fmaxD.cl b/amd-builtins/math64/fmaxD.cl new file mode 100644 index 0000000..682317a --- /dev/null +++ b/amd-builtins/math64/fmaxD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// HSAIL device builtin providing double max.
extern __attribute__((pure)) double __hsail_max_f64(double,double);

// fmax: thin wrapper over the HSAIL max builtin.
__attribute__((overloadable, always_inline)) double
fmax(double x, double y)
{
    return __hsail_max_f64(x, y);
}
diff --git a/amd-builtins/math64/fminD.cl b/amd-builtins/math64/fminD.cl new file mode 100644 index 0000000..30f13bb --- /dev/null +++ b/amd-builtins/math64/fminD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// HSAIL device builtin providing double min.
extern __attribute__((pure)) double __hsail_min_f64(double,double);

// fmin: thin wrapper over the HSAIL min builtin.
__attribute__((overloadable, always_inline)) double
fmin(double x, double y)
{
    return __hsail_min_f64(x, y);
}
diff --git a/amd-builtins/math64/fmodD.cl b/amd-builtins/math64/fmodD.cl new file mode 100644 index 0000000..8906e14 --- /dev/null +++ b/amd-builtins/math64/fmodD.cl
@@ -0,0 +1,27 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +#define COMPILING_FMOD +#include "remainderD.h" +
diff --git a/amd-builtins/math64/fractD.cl b/amd-builtins/math64/fractD.cl new file mode 100644 index 0000000..2d8ea83 --- /dev/null +++ b/amd-builtins/math64/fractD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// fract(x, ip): stores floor(x) in *ip and returns the fractional part,
// computed entirely with integer operations on the IEEE-754 bit pattern.
__attribute__((overloadable, always_inline)) double
fract(double x, double *ip)
{
    long j = as_long(x);
    long z = j & 0x8000000000000000L;            // sign bit of x
    long a = j ^ z;                              // bits of |x|
    long n = a == 0x7ff0000000000000L ? z : j;   // +/-inf maps to signed zero (selected below)
    long s = a != 0L & z != 0L ? 0xbff0000000000000L : z; // -1.0 if x is negative nonzero, else signed 0
    int e = ((int)(j >> 52) & 0x7ff) - 1023;     // unbiased exponent
    long m = 0x000fffffffffffffL >> e;           // mantissa bits below the binary point
    long k = 0x0010000000000000L >> e;           // one unit at the binary point
    k = (j & m) != 0L & z != 0L ? k : 0L;        // negative non-integers: bump magnitude so floor rounds down
    k += j;
    k &= ~m;                                     // clear fractional mantissa bits -> floor(x)
    k = e < 0 ? s : k;                           // |x| < 1: floor is +/-0 or -1.0
    k = e > 51 ? j : k;                          // exponent too large to hold fraction bits: x is integral
    double i = as_double(k);
    long d = as_long(x - i);
    d -= d == 0x3ff0000000000000L;               // clamp 1.0 down one ulp (result must be < 1.0 per spec)
    d = a ? d : z;                               // +/-0 input -> +/-0 fraction
    d = e == 1024 ? n : d;                       // inf/NaN: propagate (inf -> signed zero via n)
    *ip = i;
    return as_double(d);
}

#if __OPENCL_C_VERSION__ < 200
// __local-pointer overload: forwards to the private-pointer version.
__attribute__((overloadable, always_inline)) double
fract(double x, __local double *ip)
{
    double i;
    double f = fract(x, &i);
    *ip = i;
    return f;
}

// __global-pointer overload: forwards to the private-pointer version.
__attribute__((overloadable, always_inline)) double
fract(double x, __global double *ip)
{
    double i;
    double f = fract(x, &i);
    *ip = i;
    return f;
}
#endif
diff --git a/amd-builtins/math64/frexpD.cl b/amd-builtins/math64/frexpD.cl new file mode 100644 index 0000000..280a1c4 --- /dev/null +++ b/amd-builtins/math64/frexpD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// frexp: decompose x into a mantissa with magnitude in [0.5, 1) and an
// integral exponent, all via bit manipulation (no FP division).
__attribute__((overloadable, always_inline, weak)) double
frexp(double x, int *ep)
{
    long i = as_long(x);
    long ai = i & 0x7fffffffffffffffL;           // bits of |x|
    int d = ai > 0 & ai < 0x0010000000000000L;   // nonzero subnormal?
    // scale subnormal by 2^54 without multiplying
    double s = as_double(ai | 0x0370000000000000L) - 0x1.0p-968;
    ai = d ? as_long(s) : ai;
    int e = (int)(ai >> 52) - 1022 - (d ? 54 : 0);  // exponent, frexp convention; undo the 2^54 scale
    int t = ai == 0 | e == 1025;                 // zero, inf, or NaN: return x with exponent 0
    // Rebuild the mantissa with the exponent field forced to -1 so its
    // magnitude lies in [0.5, 1), keeping the original sign.
    i = (i & 0x8000000000000000L) | 0x3fe0000000000000L | (ai & 0x000fffffffffffffL);
    *ep = t ? 0 : e;
    return t ? x : as_double(i);
}

#if __OPENCL_C_VERSION__ < 200
// __local-pointer overload: forwards to the private-pointer version.
__attribute__((overloadable, always_inline, weak)) double
frexp(double x, __local int *ep)
{
    int e;
    double f = frexp(x, &e);
    *ep = e;
    return f;
}

// __global-pointer overload: forwards to the private-pointer version.
__attribute__((overloadable, always_inline, weak)) double
frexp(double x, __global int *ep)
{
    int e;
    double f = frexp(x, &e);
    *ep = e;
    return f;
}
#endif
diff --git a/amd-builtins/math64/hypotD.cl b/amd-builtins/math64/hypotD.cl new file mode 100644 index 0000000..87c2df6 --- /dev/null +++ b/amd-builtins/math64/hypotD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// hypot(x, y) = sqrt(x^2 + y^2) without undue intermediate
// overflow/underflow, using a pre/post power-of-two rescale.
__attribute__((overloadable)) double
hypot(double x, double y)
{
    // Work with |x| and |y|; extract biased exponents for range checks.
    ulong ux = as_ulong(x) & ~SIGNBIT_DP64;
    int xexp = ux >> EXPSHIFTBITS_DP64;
    x = as_double(ux);

    ulong uy = as_ulong(y) & ~SIGNBIT_DP64;
    int yexp = uy >> EXPSHIFTBITS_DP64;
    y = as_double(uy);

    // If either operand is very large, scale both down by 2^-600 before
    // squaring and back up by 2^600 after the sqrt; if either is very
    // small, scale the other way.
    int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500;
    double preadjust = c ? 0x1.0p-600 : 1.0;
    double postadjust = c ? 0x1.0p+600 : 1.0;

    c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500;
    preadjust = c ? 0x1.0p+600 : preadjust;
    postadjust = c ? 0x1.0p-600 : postadjust;

    double ax = x * preadjust;
    double ay = y * preadjust;

    // The post adjust may overflow, but this can't be avoided in any case
    double r = sqrt(fma(ax, ax, ay*ay)) * postadjust;

    // If the difference in exponents between x and y is large
    // the smaller operand cannot affect the result: return |x| + |y|.
    double s = x + y;
    c = abs(xexp - yexp) > MANTLENGTH_DP64 + 1;
    r = c ? s : r;

    // Check for NaN
    c = isnan(x) | isnan(y);
    r = c ? as_double(QNANBITPATT_DP64) : r;

    // If either is Inf, we must return Inf
    c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64);
    r = c ? as_double(PINFBITPATT_DP64) : r;

    return r;
}
diff --git a/amd-builtins/math64/ilogbD.cl b/amd-builtins/math64/ilogbD.cl new file mode 100644 index 0000000..3a66936 --- /dev/null +++ b/amd-builtins/math64/ilogbD.cl
@@ -0,0 +1,37 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +__attribute__((overloadable, always_inline)) int +ilogb(double x) +{ + ulong ux = as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + int r = (int)(ax >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + int rs = -1011 - (int)clz(ax & MANTBITS_DP64); + r = ax < 0x0010000000000000UL ? rs : r; + r = ax > 0x7ff0000000000000UL | ax == 0UL ? 0x80000000 : r; + r = ax == 0x7ff0000000000000UL ? 0x7fffffff : r; + return r; +} +
diff --git a/amd-builtins/math64/ldexpD.cl b/amd-builtins/math64/ldexpD.cl new file mode 100644 index 0000000..58c549d --- /dev/null +++ b/amd-builtins/math64/ldexpD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// ldexp(x, n): compute x * 2^n by direct manipulation of the IEEE-754
// exponent field, handling subnormal inputs, overflow to +-Inf, and
// underflow to signed zero.
__attribute__((overloadable, always_inline, weak)) double
ldexp(double x, int n)
{
    long l = as_ulong(x);
    int e = (l >> 52) & 0x7ff;          // biased exponent field of x
    long s = l & 0x8000000000000000;    // sign bit of x

    // Subnormal input: normalize by multiplying with 2^53, then read
    // the effective exponent back out of the scaled value.
    ulong ux = as_ulong(x * 0x1.0p+53);
    int de = ((int)(ux >> 52) & 0x7ff) - 53;
    int c = e == 0;
    e = c ? de: e;

    // Use the normalized bits for subnormal x, original bits otherwise.
    ux = c ? ux : l;

    // New biased exponent, clamped so the selects below see saturated
    // sentinel values for extreme n.
    int v = e + n;
    v = clamp(v, -0x7ff, 0x7ff);

    // Keep sign and mantissa only; the exponent is re-inserted below.
    ux &= ~EXPBITS_DP64;

    // Result for v <= 0: build with exponent v+53 and scale by 2^-53 so
    // the final multiply performs correctly rounded subnormalization.
    double mr = as_double(ux | ((ulong)(v+53) << 52));
    mr = mr * 0x1.0p-53;

    // Normal result: insert the new exponent directly.
    mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr;

    // v saturated at the clamp bound => exponent overflow => signed Inf.
    mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr;
    // Shifted entirely below the subnormal range => signed zero.
    mr = v < -53 ? as_double(s) : mr;

    // Identity cases: n == 0, infinite x, or zero x return x unchanged.
    mr = (n == 0 | isinf(x) | x == 0 ) ? x : mr;
    return mr;
}
diff --git a/amd-builtins/math64/lgammaD.cl b/amd-builtins/math64/lgammaD.cl new file mode 100644 index 0000000..eb5119a --- /dev/null +++ b/amd-builtins/math64/lgammaD.cl
@@ -0,0 +1,306 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +// ==================================================== +// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. +// +// Developed at SunPro, a Sun Microsystems, Inc. business. +// Permission to use, copy, modify, and distribute this +// software is freely granted, provided that this notice +// is preserved. +// ==================================================== + +// lgamma_r(x, i) +// Reentrant version of the logarithm of the Gamma function +// with user provide pointer for the sign of Gamma(x). +// +// Method: +// 1. 
Argument Reduction for 0 < x <= 8 +// Since gamma(1+s)=s*gamma(s), for x in [0,8], we may +// reduce x to a number in [1.5,2.5] by +// lgamma(1+s) = log(s) + lgamma(s) +// for example, +// lgamma(7.3) = log(6.3) + lgamma(6.3) +// = log(6.3*5.3) + lgamma(5.3) +// = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3) +// 2. Polynomial approximation of lgamma around its +// minimun ymin=1.461632144968362245 to maintain monotonicity. +// On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use +// Let z = x-ymin; +// lgamma(x) = -1.214862905358496078218 + z^2*poly(z) +// where +// poly(z) is a 14 degree polynomial. +// 2. Rational approximation in the primary interval [2,3] +// We use the following approximation: +// s = x-2.0; +// lgamma(x) = 0.5*s + s*P(s)/Q(s) +// with accuracy +// |P/Q - (lgamma(x)-0.5s)| < 2**-61.71 +// Our algorithms are based on the following observation +// +// zeta(2)-1 2 zeta(3)-1 3 +// lgamma(2+s) = s*(1-Euler) + --------- * s - --------- * s + ... +// 2 3 +// +// where Euler = 0.5771... is the Euler constant, which is very +// close to 0.5. +// +// 3. For x>=8, we have +// lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+.... +// (better formula: +// lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...) +// Let z = 1/x, then we approximation +// f(z) = lgamma(x) - (x-0.5)(log(x)-1) +// by +// 3 5 11 +// w = w0 + w1*z + w2*z + w3*z + ... + w6*z +// where +// |w - f(z)| < 2**-58.74 +// +// 4. For negative x, since (G is gamma function) +// -x*G(-x)*G(x) = pi/sin(pi*x), +// we have +// G(x) = pi/(sin(pi*x)*(-x)*G(-x)) +// since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0 +// Hence, for x<0, signgam = sign(sin(pi*x)) and +// lgamma(x) = log(|Gamma(x)|) +// = log(pi/(|x*sin(pi*x)|)) - lgamma(-x); +// Note: one should avoid compute pi*(-x) directly in the +// computation of sin(pi*(-x)). +// +// 5. 
Special Cases +// lgamma(2+s) ~ s*(1-Euler) for tiny s +// lgamma(1)=lgamma(2)=0 +// lgamma(x) ~ -log(x) for tiny x +// lgamma(0) = lgamma(inf) = inf +// lgamma(-integer) = +-inf +// + +#define pi 3.14159265358979311600e+00 /* 0x400921FB, 0x54442D18 */ + +#define a0 7.72156649015328655494e-02 /* 0x3FB3C467, 0xE37DB0C8 */ +#define a1 3.22467033424113591611e-01 /* 0x3FD4A34C, 0xC4A60FAD */ +#define a2 6.73523010531292681824e-02 /* 0x3FB13E00, 0x1A5562A7 */ +#define a3 2.05808084325167332806e-02 /* 0x3F951322, 0xAC92547B */ +#define a4 7.38555086081402883957e-03 /* 0x3F7E404F, 0xB68FEFE8 */ +#define a5 2.89051383673415629091e-03 /* 0x3F67ADD8, 0xCCB7926B */ +#define a6 1.19270763183362067845e-03 /* 0x3F538A94, 0x116F3F5D */ +#define a7 5.10069792153511336608e-04 /* 0x3F40B6C6, 0x89B99C00 */ +#define a8 2.20862790713908385557e-04 /* 0x3F2CF2EC, 0xED10E54D */ +#define a9 1.08011567247583939954e-04 /* 0x3F1C5088, 0x987DFB07 */ +#define a10 2.52144565451257326939e-05 /* 0x3EFA7074, 0x428CFA52 */ +#define a11 4.48640949618915160150e-05 /* 0x3F07858E, 0x90A45837 */ + +#define tc 1.46163214496836224576e+00 /* 0x3FF762D8, 0x6356BE3F */ +#define tf -1.21486290535849611461e-01 /* 0xBFBF19B9, 0xBCC38A42 */ +#define tt -3.63867699703950536541e-18 /* 0xBC50C7CA, 0xA48A971F */ + +#define t0 4.83836122723810047042e-01 /* 0x3FDEF72B, 0xC8EE38A2 */ +#define t1 -1.47587722994593911752e-01 /* 0xBFC2E427, 0x8DC6C509 */ +#define t2 6.46249402391333854778e-02 /* 0x3FB08B42, 0x94D5419B */ +#define t3 -3.27885410759859649565e-02 /* 0xBFA0C9A8, 0xDF35B713 */ +#define t4 1.79706750811820387126e-02 /* 0x3F9266E7, 0x970AF9EC */ +#define t5 -1.03142241298341437450e-02 /* 0xBF851F9F, 0xBA91EC6A */ +#define t6 6.10053870246291332635e-03 /* 0x3F78FCE0, 0xE370E344 */ +#define t7 -3.68452016781138256760e-03 /* 0xBF6E2EFF, 0xB3E914D7 */ +#define t8 2.25964780900612472250e-03 /* 0x3F6282D3, 0x2E15C915 */ +#define t9 -1.40346469989232843813e-03 /* 0xBF56FE8E, 0xBF2D1AF1 */ +#define t10 
8.81081882437654011382e-04 /* 0x3F4CDF0C, 0xEF61A8E9 */ +#define t11 -5.38595305356740546715e-04 /* 0xBF41A610, 0x9C73E0EC */ +#define t12 3.15632070903625950361e-04 /* 0x3F34AF6D, 0x6C0EBBF7 */ +#define t13 -3.12754168375120860518e-04 /* 0xBF347F24, 0xECC38C38 */ +#define t14 3.35529192635519073543e-04 /* 0x3F35FD3E, 0xE8C2D3F4 */ + +#define u0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ +#define u1 6.32827064025093366517e-01 /* 0x3FE4401E, 0x8B005DFF */ +#define u2 1.45492250137234768737e+00 /* 0x3FF7475C, 0xD119BD6F */ +#define u3 9.77717527963372745603e-01 /* 0x3FEF4976, 0x44EA8450 */ +#define u4 2.28963728064692451092e-01 /* 0x3FCD4EAE, 0xF6010924 */ +#define u5 1.33810918536787660377e-02 /* 0x3F8B678B, 0xBF2BAB09 */ + +#define v1 2.45597793713041134822e+00 /* 0x4003A5D7, 0xC2BD619C */ +#define v2 2.12848976379893395361e+00 /* 0x40010725, 0xA42B18F5 */ +#define v3 7.69285150456672783825e-01 /* 0x3FE89DFB, 0xE45050AF */ +#define v4 1.04222645593369134254e-01 /* 0x3FBAAE55, 0xD6537C88 */ +#define v5 3.21709242282423911810e-03 /* 0x3F6A5ABB, 0x57D0CF61 */ + +#define s0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ +#define s1 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */ +#define s2 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */ +#define s3 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */ +#define s4 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */ +#define s5 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */ +#define s6 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */ + +#define r1 1.39200533467621045958e+00 /* 0x3FF645A7, 0x62C4AB74 */ +#define r2 7.21935547567138069525e-01 /* 0x3FE71A18, 0x93D3DCDC */ +#define r3 1.71933865632803078993e-01 /* 0x3FC601ED, 0xCCFBDF27 */ +#define r4 1.86459191715652901344e-02 /* 0x3F9317EA, 0x742ED475 */ +#define r5 7.77942496381893596434e-04 /* 0x3F497DDA, 0xCA41A95B */ +#define r6 7.32668430744625636189e-06 /* 0x3EDEBAF7, 0xA5B38140 */ + +#define w0 
4.18938533204672725052e-01 /* 0x3FDACFE3, 0x90C97D69 */
#define w1 8.33333333333329678849e-02 /* 0x3FB55555, 0x5555553B */
#define w2 -2.77777777728775536470e-03 /* 0xBF66C16C, 0x16B02E5C */
#define w3 7.93650558643019558500e-04 /* 0x3F4A019F, 0x98CF38B6 */
#define w4 -5.95187557450339963135e-04 /* 0xBF4380CB, 0x8C0FE741 */
#define w5 8.36339918996282139126e-04 /* 0x3F4B67BA, 0x4CDAD5D1 */
#define w6 -1.63092934096575273989e-03 /* 0xBF5AB89D, 0x0B9E43E4 */

// lgamma_r(x, ip): log(|Gamma(x)|); *ip receives the sign of Gamma(x)
// (+1 or -1). Follows the SunPro/fdlibm method described in the header
// comment above: interval selection and polynomial kernels on [0,2],
// argument reduction on [2,8), asymptotic series for x >= 8, and the
// reflection formula for negative x.
__attribute__ ((overloadable, always_inline)) double
lgamma_r(double x, int *ip)
{
    ulong ux = as_ulong(x);
    ulong ax = ux & EXSIGNBIT_DP64;
    double absx = as_double(ax);

    if (ax >= 0x7ff0000000000000UL) {
        // +-Inf, NaN
        *ip = 1;
        return absx;
    }

    if (absx < 0x1.0p-70) {
        // |x| tiny: lgamma(x) ~ -log(|x|); sign of Gamma follows sign of x.
        *ip = ax == ux ? 1 : -1;
        return -log(absx);
    }

    // Handle rest of range
    double r;

    if (absx < 2.0) {
        // Select kernel i and reduced argument y; the selects run from
        // the top of the interval downward, so later (smaller-threshold)
        // selects override earlier ones.
        int i = 0;
        double y = 2.0 - absx;

        int c = absx < 0x1.bb4c3p+0;
        double t = absx - tc;
        i = c ? 1 : i;
        y = c ? t : y;

        c = absx < 0x1.3b4c4p+0;
        t = absx - 1.0;
        i = c ? 2 : i;
        y = c ? t : y;

        c = absx <= 0x1.cccccp-1;
        t = -log(absx);
        r = c ? t : 0.0;
        t = 1.0 - absx;
        i = c ? 0 : i;
        y = c ? t : y;

        c = absx < 0x1.76944p-1;
        t = absx - (tc - 1.0);
        i = c ? 1 : i;
        y = c ? t : y;

        c = absx < 0x1.da661p-3;
        i = c ? 2 : i;
        y = c ? absx : y;

        double p, q;

        switch (i) {
        case 0:
            // Rational-free polynomial kernel with coefficients a0..a11.
            p = fma(y, fma(y, fma(y, fma(y, a11, a10), a9), a8), a7);
            p = fma(y, fma(y, fma(y, fma(y, p, a6), a5), a4), a3);
            p = fma(y, fma(y, fma(y, p, a2), a1), a0);
            r = fma(y, p - 0.5, r);
            break;
        case 1:
            // Expansion around the minimum tc with split value tf + tt.
            p = fma(y, fma(y, fma(y, fma(y, t14, t13), t12), t11), t10);
            p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t9), t8), t7), t6), t5);
            p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t4), t3), t2), t1), t0);
            p = fma(y*y, p, -tt);
            r += (tf + p);
            break;
        case 2:
            // Rational approximation u(y)/v(y) near y = 0.
            p = y*fma(y, fma(y, fma(y, fma(y, fma(y, u5, u4), u3), u2), u1), u0);
            q = fma(y, fma(y, fma(y, fma(y, fma(y, v5, v4), v3), v2), v1), 1.0);
            r += fma(-0.5, y, p/q);
        }
    } else if (absx < 8.0) {
        // Rational kernel on [2,3) plus log of the product of reduction
        // factors for the integer part.
        int i = absx;
        double y = absx - (double)i;
        double p = y*fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, s6, s5), s4), s3), s2), s1), s0);
        double q = fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, r6, r5), r4), r3), r2), r1), 1.0);
        r = fma(0.5, y, p/q);
        double z = 1.0;
        // lgamma(1+s) = log(s) + lgamma(s)
        double y6 = y + 6.0;
        double y5 = y + 5.0;
        double y4 = y + 4.0;
        double y3 = y + 3.0;
        double y2 = y + 2.0;
        z *= i > 6 ? y6 : 1.0;
        z *= i > 5 ? y5 : 1.0;
        z *= i > 4 ? y4 : 1.0;
        z *= i > 3 ? y3 : 1.0;
        z *= i > 2 ? y2 : 1.0;
        r += log(z);
    } else {
        // x >= 8: Stirling-type asymptotic series in 1/x with w0..w6.
        double z = 1.0 / absx;
        double z2 = z * z;
        double w = fma(z, fma(z2, fma(z2, fma(z2, fma(z2, fma(z2, w6, w5), w4), w3), w2), w1), w0);
        r = (absx - 0.5) * (log(absx) - 1.0) + w;
    }

    if (x < 0.0) {
        // Reflection: lgamma(x) = log(pi/|x*sin(pi*x)|) - lgamma(-x);
        // negative integers (sin(pi*x) == 0) give +Inf, and the sign of
        // Gamma(x) follows the sign of sin(pi*x).
        double t = sinpi(x);
        r = log(pi / fabs(t * x)) - r;
        r = t == 0.0 ? as_double(PINFBITPATT_DP64) : r;
        *ip = t < 0.0 ? -1 : 1;
    } else
        *ip = 1;

    return r;
}

#if __OPENCL_C_VERSION__ < 200
// Pre-2.0 address-space overload: forward to the private-pointer
// version and copy the sign out to __local memory.
__attribute__ ((overloadable, always_inline)) double
lgamma_r(double x, __local int *ip)
{
    int i;
    double ret = lgamma_r(x, &i);
    *ip = i;
    return ret;
}

// Pre-2.0 address-space overload: forward and copy the sign to
// __global memory.
__attribute__ ((overloadable, always_inline)) double
lgamma_r(double x, __global int *ip)
{
    int i;
    double ret = lgamma_r(x, &i);
    *ip = i;
    return ret;
}
#endif

// lgamma(x): lgamma_r with the sign result discarded.
__attribute__ ((overloadable, always_inline)) double
lgamma(double x)
{
    int i;
    return lgamma_r(x, &i);
}
diff --git a/amd-builtins/math64/log10D.cl b/amd-builtins/math64/log10D.cl new file mode 100644 index 0000000..df53cd8 --- /dev/null +++ b/amd-builtins/math64/log10D.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_LOG10 +#include "logD_base.h" +
diff --git a/amd-builtins/math64/log1pD.cl b/amd-builtins/math64/log1pD.cl new file mode 100644 index 0000000..b1aaf5d --- /dev/null +++ b/amd-builtins/math64/log1pD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// log1p(x) = log(1 + x), accurate for x near 0. Both the table-driven
// path (for x outside a small window around 0) and the direct series
// path (inside the window) are always computed; the final answer is
// chosen by selects so the code stays branch-free.
__attribute__((overloadable)) double
log1p(double x)
{
    USE_TABLE(double2, p_tbl, LN_TBL);

    // Computes natural log(1+x). Algorithm based on:
    // Ping-Tak Peter Tang
    // "Table-driven implementation of the logarithm function in IEEE
    // floating-point arithmetic"
    // ACM Transactions on Mathematical Software (TOMS)
    // Volume 16, Issue 4 (December 1990)
    // Note that we use a lookup table of size 64 rather than 128,
    // and compensate by having extra terms in the minimax polynomial
    // for the kernel approximation.

    // Process Inside the threshold now
    ulong ux = as_ulong(1.0 + x);
    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64;
    double f = as_double(ONEEXPBITS_DP64 | (ux & MANTBITS_DP64));

    // Table index from the top 7 mantissa bits, with round-to-nearest
    // on the 8th bit.
    int j = as_int2(ux).hi >> 13;
    j = ((0x80 | (j & 0x7e)) >> 1) + (j & 0x1);
    double f1 = (double)j * 0x1.0p-6;
    j -= 64;

    // f2 = f - f1, computed three ways; the exact-fma forms recover the
    // low-order bits of (1+x) that were lost in the rounded addition,
    // selected by exponent range.
    double f2temp = f - f1;
    double m2 = as_double(convert_ulong(0x3ff - xexp) << EXPSHIFTBITS_DP64);
    double f2l = fma(m2, x, m2 - f1);
    double f2g = fma(m2, x, -f1) + m2;
    double f2 = xexp <= MANTLENGTH_DP64-1 ? f2l : f2g;
    f2 = (xexp <= -2) | (xexp >= MANTLENGTH_DP64+8) ? f2temp : f2;

    // Table entry: z1 = high part of log(f1), q = low part.
    double2 tv = p_tbl[j];
    double z1 = tv.s0;
    double q = tv.s1;

    double u = MATH_DIVIDE(f2, fma(0.5, f2, f1));
    double v = u * u;

    // Minimax polynomial for log(1 - r) kernel.
    double poly = v * fma(v,
                          fma(v, 2.23219810758559851206e-03, 1.24999999978138668903e-02),
                          8.33333333333333593622e-02);

    // log2_lead and log2_tail sum to an extra-precise version of log(2)
    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */

    double z2 = q + fma(u, poly, u);
    double dxexp = (double)xexp;
    double r1 = fma(dxexp, log2_lead, z1);
    double r2 = fma(dxexp, log2_tail, z2);
    double result1 = r1 + r2;

    // Process Outside the threshold now
    // Direct series: log(1+r) via s = r/(2+r) expansion, no table.
    double r = x;
    u = r / (2.0 + r);
    double correction = r * u;
    u = u + u;
    v = u * u;
    r1 = r;

    poly = fma(v,
               fma(v,
                   fma(v, 4.34887777707614552256e-04, 2.23213998791944806202e-03),
                   1.25000000037717509602e-02),
               8.33333333333317923934e-02);

    r2 = fma(u*v, poly, -correction);

    // The values exp(-1/16)-1 and exp(1/16)-1
    const double log1p_thresh1 = -0x1.f0540438fd5c3p-5;
    const double log1p_thresh2 = 0x1.082b577d34ed8p-4;
    double result2 = r1 + r2;
    result2 = x < log1p_thresh1 | x > log1p_thresh2 ? result1 : result2;

    // Edge cases: +Inf passes through, x < -1 is NaN, x == -1 is -Inf.
    result2 = isinf(x) ? x : result2;
    result2 = x < -1.0 ? as_double(QNANBITPATT_DP64) : result2;
    result2 = x == -1.0 ? as_double(NINFBITPATT_DP64) : result2;
    return result2;
}
diff --git a/amd-builtins/math64/log2D.cl b/amd-builtins/math64/log2D.cl new file mode 100644 index 0000000..86010f6 --- /dev/null +++ b/amd-builtins/math64/log2D.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_LOG2 +#include "logD_base.h" +
diff --git a/amd-builtins/math64/logD.cl b/amd-builtins/math64/logD.cl new file mode 100644 index 0000000..0cd4763 --- /dev/null +++ b/amd-builtins/math64/logD.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_LOG +#include "logD_base.h" +
diff --git a/amd-builtins/math64/logD_base.h b/amd-builtins/math64/logD_base.h new file mode 100644 index 0000000..a07e902 --- /dev/null +++ b/amd-builtins/math64/logD_base.h
@@ -0,0 +1,193 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "math64.h" + +// Algorithm: +// +// Based on: +// Ping-Tak Peter Tang +// "Table-driven implementation of the logarithm function in IEEE +// floating-point arithmetic" +// ACM Transactions on Mathematical Software (TOMS) +// Volume 16, Issue 4 (December 1990) +// +// +// x very close to 1.0 is handled differently, for x everywhere else +// a brief explanation is given below +// +// x = (2^m)*A +// x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8)) +// x = (2^m)*2*(G/2+g/2) +// x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9)) +// +// Y = (2^(-1))*(2^(-m))*(2^m)*A +// Now, range of Y is: 0.5 <= Y < 1 +// +// F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit) +// Now, range of F is: 128 <= F <= 256 +// F = F / 256 +// Now, range of F is: 0.5 <= F <= 1 +// +// f = -(Y-F), with (f <= 2^(-9)) +// +// log(x) = m*log(2) + log(2) + log(F-f) +// log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F)) +// log(x) = m*log(2) + log(2*F) + log(1-r) +// +// r = (f/F), with (r <= 2^(-8)) +// r = f*(1/F) with (1/F) precomputed to avoid division +// +// log(x) = m*log(2) + log(G) - poly +// +// log(G) is precomputed +// poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5)) +// +// log(2) and log(G) need to be maintained in extra precision +// to avoid losing precision in the calculations +// +// +// For x close to 1.0, we employ the following technique to +// ensure faster convergence. 
//
// log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7
// x = ((1+s)/(1-s))
// x = 1 + r
// s = r/(2+r)

// Single body compiled three ways: log, log2, or log10, selected by the
// COMPILING_LOG2 / COMPILING_LOG10 macros set by the including .cl file.
// Both the near-1 series path and the far-from-1 table path are always
// evaluated; the final selects pick the result and apply edge cases.
__attribute__((overloadable)) double
#if defined(COMPILING_LOG2)
log2(double x)
#elif defined(COMPILING_LOG10)
log10(double x)
#else
log(double x)
#endif
{
    USE_TABLE(double2, p_tbl, LN_TBL);

#ifndef COMPILING_LOG2
    // log2_lead and log2_tail sum to an extra-precise version of ln(2)
    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
#endif

#if defined(COMPILING_LOG10)
    // log10e_lead and log10e_tail sum to an extra-precision version of log10(e) (19 bits in lead)
    const double log10e_lead = 4.34293746948242187500e-01; /* 0x3fdbcb7800000000 */
    const double log10e_tail = 7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
#elif defined(COMPILING_LOG2)
    // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19 bits in lead)
    const double log2e_lead = 1.44269180297851562500E+00; /* 0x3FF7154400000000 */
    const double log2e_tail = 3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
#endif

    // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
    // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000
    const double log_thresh1 = 0x1.e0faap-1;
    const double log_thresh2 = 0x1.1082cp+0;

    int is_near = x >= log_thresh1 & x <= log_thresh2;

    // Near 1 code
    double r = x - 1.0;
    double u = r / (2.0 + r);
    double correction = r * u;
    u = u + u;
    double v = u * u;
    double r1 = r;

    const double ca_1 = 8.33333333333317923934e-02; /* 0x3fb55555555554e6 */
    const double ca_2 = 1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */
    const double ca_3 = 2.23213998791944806202e-03; /* 0x3f62492307f1519f */
    const double ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */

    double r2 = fma(u*v, fma(v, fma(v, fma(v, ca_4, ca_3), ca_2), ca_1), -correction);

#if defined(COMPILING_LOG10)
    // Split r1 into a 32-bit-exact head so the base-conversion fmas keep
    // extra precision; same pattern for log2 below.
    r = r1;
    r1 = as_double(as_ulong(r1) & 0xffffffff00000000);
    r2 = r2 + (r - r1);
    double ret_near = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail * r2)));
#elif defined(COMPILING_LOG2)
    r = r1;
    r1 = as_double(as_ulong(r1) & 0xffffffff00000000);
    r2 = r2 + (r - r1);
    double ret_near = fma(log2e_lead, r1, fma(log2e_lead, r2, fma(log2e_tail, r1, log2e_tail*r2)));
#else
    double ret_near = r1 + r2;
#endif

    // This is the far from 1 code

    // Deal with subnormal
    ulong ux = as_ulong(x);
    ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
    int c = ux < IMPBIT_DP64;
    ux = c ? uxs : ux;
    int expadjust = c ? 60 : 0;

    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
    double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
    // Table index from the top 7 mantissa bits, rounding on the 8th.
    int index = as_int2(ux).hi >> 13;
    index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);

    // z1/q: high/low parts of log(F) from the table.
    double2 tv = p_tbl[index - 64];
    double z1 = tv.s0;
    double q = tv.s1;

    double f1 = index * 0x1.0p-7;
    double f2 = f - f1;
    u = f2 / fma(f2, 0.5, f1);
    v = u * u;

    const double cb_1 = 8.33333333333333593622e-02; /* 0x3fb5555555555557 */
    const double cb_2 = 1.24999999978138668903e-02; /* 0x3f89999999865ede */
    const double cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */

    double poly = v * fma(v, fma(v, cb_3, cb_2), cb_1);
    double z2 = q + fma(u, poly, u);

    double dxexp = (double)xexp;
#if defined (COMPILING_LOG10)
    // Add xexp * log(2) to z1,z2 to get log(x)
    r1 = fma(dxexp, log2_lead, z1);
    r2 = fma(dxexp, log2_tail, z2);
    double ret_far = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail*r2)));
#elif defined(COMPILING_LOG2)
    r1 = fma(log2e_lead, z1, dxexp);
    r2 = fma(log2e_lead, z2, fma(log2e_tail, z1, log2e_tail*z2));
    double ret_far = r1 + r2;
#else
    r1 = fma(dxexp, log2_lead, z1);
    r2 = fma(dxexp, log2_tail, z2);
    double ret_far = r1 + r2;
#endif

    double ret = is_near ? ret_near : ret_far;

    // Edge cases: +Inf -> +Inf; NaN or negative -> NaN; zero -> -Inf.
    ret = isinf(x) ? as_double(PINFBITPATT_DP64) : ret;
    ret = isnan(x) | x < 0.0 ? as_double(QNANBITPATT_DP64) : ret;
    ret = x == 0.0 ? as_double(NINFBITPATT_DP64) : ret;
    return ret;
}
diff --git a/amd-builtins/math64/logD_table.h b/amd-builtins/math64/logD_table.h new file mode 100644 index 0000000..a7ccc20 --- /dev/null +++ b/amd-builtins/math64/logD_table.h
@@ -0,0 +1,90 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+DECLARE_TABLE(double2, LN_TBL, 65, /* ln(1 + j/64) for j = 0..64 as (high, low) double-double pairs; .s0 has a short mantissa, .s1 carries the tail; entry 64 is ln(2) */
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), /* j = 0: ln(1) = 0 */
+    (double2)(0x1.fc0a800000000p-7, 0x1.61f807c79f3dbp-28),
+    (double2)(0x1.f829800000000p-6, 0x1.873c1980267c8p-25),
+    (double2)(0x1.7745800000000p-5, 0x1.ec65b9f88c69ep-26),
+    (double2)(0x1.f0a3000000000p-5, 0x1.8022c54cc2f99p-26),
+    (double2)(0x1.341d700000000p-4, 0x1.2c37a3a125330p-25),
+    (double2)(0x1.6f0d200000000p-4, 0x1.15cad69737c93p-25),
+    (double2)(0x1.a926d00000000p-4, 0x1.d256ab1b285e9p-27),
+    (double2)(0x1.e270700000000p-4, 0x1.b8abcb97a7aa2p-26),
+    (double2)(0x1.0d77e00000000p-3, 0x1.f34239659a5dcp-25),
+    (double2)(0x1.2955280000000p-3, 0x1.e07fd48d30177p-25),
+    (double2)(0x1.44d2b00000000p-3, 0x1.b32df4799f4f6p-25),
+    (double2)(0x1.5ff3000000000p-3, 0x1.c29e4f4f21cf8p-25),
+    (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30),
+    (double2)(0x1.9525a80000000p-3, 0x1.cf456b4764130p-27),
+    (double2)(0x1.af3c900000000p-3, 0x1.3a02ffcb63398p-25),
+    (double2)(0x1.c8ff780000000p-3, 0x1.1e6a6886b0976p-25),
+    (double2)(0x1.e270700000000p-3, 0x1.b8abcb97a7aa2p-25),
+    (double2)(0x1.fb91800000000p-3, 0x1.b578f8aa35552p-25),
+    (double2)(0x1.0a324c0000000p-2, 0x1.139c871afb9fcp-25),
+    (double2)(0x1.1675c80000000p-2, 0x1.5d5d30701ce64p-25),
+    (double2)(0x1.22941c0000000p-2, 0x1.de7bcb2d12142p-25),
+    (double2)(0x1.2e8e280000000p-2, 0x1.d708e984e1664p-25),
+    (double2)(0x1.3a64c40000000p-2, 0x1.56945e9c72f36p-26),
+    (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bdap-29),
+    (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f6p-28),
+    (double2)(0x1.5d1bd80000000p-2, 0x1.fac04e52846c7p-25),
+    (double2)(0x1.686c800000000p-2, 0x1.e9b14aec442bep-26),
+    (double2)(0x1.739d7c0000000p-2, 0x1.b5de8034e7126p-25),
+    (double2)(0x1.7eaf800000000p-2, 0x1.dc157e1b259d3p-25),
+    (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28),
+    (double2)(0x1.9479400000000p-2, 0x1.c2116faba4cddp-26),
+    (double2)(0x1.9f323c0000000p-2, 0x1.65fcc25f95b47p-25),
+    (double2)(0x1.a9cec80000000p-2, 0x1.a9a08498d4850p-26),
+    (double2)(0x1.b44f740000000p-2, 0x1.de647b1465f77p-25),
+    (double2)(0x1.beb4d80000000p-2, 0x1.da71b7bf7861dp-26),
+    (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28),
+    (double2)(0x1.d32fe40000000p-2, 0x1.f0075eab0ef64p-25),
+    (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989bp-28),
+    (double2)(0x1.e744240000000p-2, 0x1.0eb43c3f1bed2p-25),
+    (double2)(0x1.f128f40000000p-2, 0x1.faf06ecb35c84p-26),
+    (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f68p-27),
+    (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27),
+    (double2)(0x1.0723e40000000p-1, 0x1.c1cdf404e5796p-25),
+    (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27),
+    (double2)(0x1.109f380000000p-1, 0x1.e2d4c96fde3ecp-25),
+    (double2)(0x1.154c3c0000000p-1, 0x1.2f4d5e9a98f34p-25),
+    (double2)(0x1.19ee6a0000000p-1, 0x1.467c96ecc5cbep-25),
+    (double2)(0x1.1e85f40000000p-1, 0x1.e7040d03dec5ap-25),
+    (double2)(0x1.23130c0000000p-1, 0x1.7bebf4282de36p-25),
+    (double2)(0x1.2795e00000000p-1, 0x1.289b11aeb783fp-25),
+    (double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26),
+    (double2)(0x1.307d720000000p-1, 0x1.34f10be1fb591p-25),
+    (double2)(0x1.34e2880000000p-1, 0x1.d9ce1d316eb93p-25),
+    (double2)(0x1.393e0c0000000p-1, 0x1.3562a19a9c442p-25),
+    (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26),
+    (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c97ap-26),
+    (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bdap-28),
+    (double2)(0x1.4a4f840000000p-1, 0x1.db03ebb0227bfp-25),
+    (double2)(0x1.4e7d800000000p-1, 0x1.1b75bb09cb098p-25),
+    (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27),
+    (double2)(0x1.56bf9c0000000p-1, 0x1.5b3f399411c62p-25),
+    (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26),
+    (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26),
+    (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25), /* j = 64: ln(2) */
+)
+
diff --git a/amd-builtins/math64/logbD.cl b/amd-builtins/math64/logbD.cl new file mode 100644 index 0000000..895a621 --- /dev/null +++ b/amd-builtins/math64/logbD.cl
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+logb(double x) /* logb: unbiased base-2 exponent of x, returned as a double */
+{
+    long ax = as_long(x) & EXSIGNBIT_DP64; /* bit pattern of |x| (sign stripped) */
+    double s = -1011L - clz(ax); /* subnormal case: MSB bit position - 1074 == -1011 - clz(|x|) */
+    double r = (int)(ax >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; /* normal case: biased exponent field minus bias */
+    r = ax >= PINFBITPATT_DP64 ? as_double(ax) : r; /* +-inf -> +inf; NaN input passes through as |NaN| */
+    r = ax < 0x0010000000000000L ? s : r; /* below the smallest normal -> use the subnormal formula */
+    r = ax == 0L ? as_double(NINFBITPATT_DP64) : r; /* logb(+-0) = -inf */
+    return r;
+}
+
diff --git a/amd-builtins/math64/madD.cl b/amd-builtins/math64/madD.cl new file mode 100644 index 0000000..5cb6d87 --- /dev/null +++ b/amd-builtins/math64/madD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+mad(double x, double y, double z) /* OpenCL mad: x*y+z, rounding left to the implementation */
+{
+    return __amdil_mad_f64(x, y, z); /* lower straight to the AMDIL mad intrinsic */
+}
+
diff --git a/amd-builtins/math64/math64.h b/amd-builtins/math64/math64.h new file mode 100644 index 0000000..d6bae16 --- /dev/null +++ b/amd-builtins/math64/math64.h
@@ -0,0 +1,96 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+#ifndef MATH64_H
+#define MATH64_H 1
+
+extern __attribute__((pure)) double __amdil_copysign_f64(double, double); /* AMDIL compiler-backend intrinsics */
+extern __attribute__((pure)) double __amdil_fma_f64(double, double, double);
+extern __attribute__((pure)) double __amdil_mad_f64(double, double, double);
+extern __attribute__((pure)) double __amdil_max_f64(double, double);
+extern __attribute__((pure)) double __amdil_min_f64(double, double);
+extern __attribute__((pure)) double __amdil_fraction_f64(double);
+extern __attribute__((pure)) double __amdil_fabs_f64(double);
+extern __attribute__((pure)) double __amdil_round_nearest_f64(double);
+extern __attribute__((pure)) double __amdil_round_neginf_f64(double);
+extern __attribute__((pure)) double __amdil_round_posinf_f64(double);
+extern __attribute__((pure)) double __amdil_round_zero_f64(double);
+extern __attribute__((pure)) double __amdil_rsq_f64(double);
+
+extern __attribute__((pure)) int __amdil_class_f64(double, int);
+
+#define SNAN 0x001 /* NOTE(review): bit masks, presumably for __amdil_class_f64's int argument -- confirm */
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+// Allow control over how division is done
+#define MATH_DIVIDE(X,Y) ((X) / (Y))
+#define MATH_RECIP(X) (1.0 / (X))
+
+// // Allow control over square root
+#define MATH_SQRT(X) sqrt(X)
+
+// Table stuff
+#define TABLE_SPACE __constant
+
+#define TABLE_MANGLE(NAME) __math64_##NAME
+
+#define USE_TABLE(TYPE,PTR,NAME) \
+    extern TABLE_SPACE TYPE TABLE_MANGLE(NAME) []; \
+    TABLE_SPACE TYPE * PTR = TABLE_MANGLE(NAME)
+
+#define DECLARE_TABLE(TYPE,NAME,LENGTH,...) \
+    TABLE_SPACE TYPE TABLE_MANGLE(NAME) [ LENGTH ] = { __VA_ARGS__ };
+
+/* Definitions for double functions on 64 bit machines */
+#define SIGNBIT_DP64      0x8000000000000000L
+#define EXSIGNBIT_DP64    0x7fffffffffffffffL /* everything except the sign bit */
+#define EXPBITS_DP64      0x7ff0000000000000L
+#define MANTBITS_DP64     0x000fffffffffffffL
+#define ONEEXPBITS_DP64   0x3ff0000000000000L /* 1.0 */
+#define TWOEXPBITS_DP64   0x4000000000000000L /* 2.0 */
+#define HALFEXPBITS_DP64  0x3fe0000000000000L /* 0.5 */
+#define IMPBIT_DP64       0x0010000000000000L /* implicit leading mantissa bit / smallest normal */
+#define QNANBITPATT_DP64  0x7ff8000000000000L /* canonical quiet NaN */
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64  0x7ff0000000000000L /* +infinity */
+#define NINFBITPATT_DP64  0xfff0000000000000L /* -infinity */
+#define EXPBIAS_DP64      1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64   1
+#define EMIN_DP64         -1022
+#define BIASEDEMAX_DP64   2046 /* 0x7fe */
+#define EMAX_DP64         1023 /* 0x3ff */
+#define LAMBDA_DP64       1.0e300
+#define MANTLENGTH_DP64   53
+#define BASEDIGITS_DP64   15
+
+#define ALIGNED(x) __attribute__((aligned(x)))
+
+#endif /* MATH64_H */
+
diff --git a/amd-builtins/math64/maxmagD.cl b/amd-builtins/math64/maxmagD.cl new file mode 100644 index 0000000..7a261cc --- /dev/null +++ b/amd-builtins/math64/maxmagD.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+maxmag(double x, double y) /* maxmag: the argument with the greater magnitude, branch-free */
+{
+    long ix = as_long(x);
+    long iy = as_long(y);
+    long ax = ix & 0x7fffffffffffffffL; /* |x| bits */
+    long ay = iy & 0x7fffffffffffffffL; /* |y| bits */
+    ax |= -(ax > 0x7ff0000000000000L); /* NaN x -> ax becomes all-ones (-1 signed), so y wins the compare */
+    ay |= -(ay > 0x7ff0000000000000L); /* NaN y -> ay becomes all-ones (-1 signed), so x wins the compare */
+    return as_double((-(ax > ay) & ix) | /* -(cond) is an all-ones mask when cond holds */
+                     (-(ay > ax) & iy) |
+                     (-(ax == ay) & ((ix & iy) | (ax & 0x0008000000000000L)))); /* tie: AND of bit patterns (negative only if both negative); the quiet-bit OR makes the result a quiet NaN when both inputs are NaN, and is a no-op otherwise */
+}
+
diff --git a/amd-builtins/math64/minmagD.cl b/amd-builtins/math64/minmagD.cl new file mode 100644 index 0000000..e357071 --- /dev/null +++ b/amd-builtins/math64/minmagD.cl
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+minmag(double x, double y) /* minmag: the argument with the smaller magnitude, branch-free */
+{
+    long ix = as_long(x);
+    long iy = as_long(y);
+    long ax = ix & 0x7fffffffffffffffL; /* |x| bits; a NaN compares larger than any finite/inf, so the other operand is returned */
+    long ay = iy & 0x7fffffffffffffffL; /* |y| bits */
+    return as_double((-(ax < ay) & ix) | /* -(cond) is an all-ones mask when cond holds */
+                     (-(ay < ax) & iy) |
+                     (-(ax == ay) & (ix | iy))); /* tie: OR of bit patterns, so the negative sign wins (e.g. +0,-0 -> -0) */
+}
+
diff --git a/amd-builtins/math64/modfD.cl b/amd-builtins/math64/modfD.cl new file mode 100644 index 0000000..2c21a8a --- /dev/null +++ b/amd-builtins/math64/modfD.cl
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+modf(double x, double *iptr) /* modf: store trunc(x) in *iptr, return the same-signed fractional part */
+{
+    long ux = as_long(x);
+    int e = ((int)(ux >> 52) & 0x7ff) - 1023; /* unbiased exponent of x */
+    long s = ux & 0x8000000000000000L; /* sign bit, preserved on signed-zero results */
+    long msk = 0xffffffffffffffffL << (52 - e); /* keeps mantissa bits with weight >= 1 (OpenCL shifts are masked, so out-of-range e is harmless here) */
+    long i = msk & ux; /* integral part by mantissa truncation */
+    long r = as_long(x - as_double(i)); /* fraction = x - trunc(x), exact */
+
+    r = e < 0 ? ux : r; /* |x| < 1: fraction is x itself ... */
+    i = e < 0 ? s : i; /* ... and the integral part is a signed zero */
+
+    r = e >= 52 ? s : r; /* x already integral (or +-inf): fraction is signed zero ... */
+    i = e >= 52 ? ux : i; /* ... and the integral part is x */
+
+    r = (ux & 0x7fffffffffffffffL) > 0x7ff0000000000000L ? ux : r; /* NaN input: NaN comes back as the fraction */
+
+    *iptr = as_double(i);
+    return as_double(r);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline)) double
+modf(double x, __global double *iptr) /* pre-OpenCL-2.0: explicit __global overload forwarding to the private-pointer version */
+{
+    double i;
+    double f = modf(x, &i);
+    *iptr = i;
+    return f;
+}
+
+__attribute__((overloadable, always_inline)) double
+modf(double x, __local double *iptr) /* pre-OpenCL-2.0: explicit __local overload forwarding to the private-pointer version */
+{
+    double i;
+    double f = modf(x, &i);
+    *iptr = i;
+    return f;
+}
+#endif
+
diff --git a/amd-builtins/math64/nanD.cl b/amd-builtins/math64/nanD.cl new file mode 100644 index 0000000..394e807 --- /dev/null +++ b/amd-builtins/math64/nanD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+nan(ulong nancode) /* nan: quiet NaN carrying the low 52 bits of nancode as payload */
+{
+    return as_double((nancode & 0x000fffffffffffffUL) | 0x7ff8000000000000UL); /* 0x7ff8... = all-ones exponent + quiet bit */
+}
+
diff --git a/amd-builtins/math64/nextafterD.cl b/amd-builtins/math64/nextafterD.cl new file mode 100644 index 0000000..6863004 --- /dev/null +++ b/amd-builtins/math64/nextafterD.cl
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+nextafter(double x, double y) /* nextafter: next representable double after x in the direction of y */
+{
+    long ix = as_long(x);
+    long ax = ix & 0x7fffffffffffffffL;
+    long mx = 0x8000000000000000L - ix; /* map sign-magnitude bits to a monotone integer ordering ... */
+    mx = ix < 0 ? mx : ix; /* ... only needed for negative values */
+    long iy = as_long(y);
+    long ay = iy & 0x7fffffffffffffffL;
+    long my = 0x8000000000000000L - iy;
+    my = iy < 0 ? my : iy;
+    long t = mx + (mx < my ? 1 : -1); /* step one ulp toward y in the monotone ordering */
+    long r = 0x8000000000000000L - t; /* map back to sign-magnitude ... */
+    r = t < 0 ? r : t; /* ... when the result is negative */
+    r = ax > 0x7ff0000000000000L ? ix : r; /* NaN x propagates unchanged */
+    r = ay > 0x7ff0000000000000L ? iy : r; /* NaN y propagates unchanged */
+    r = (ax|ay) == 0L | ix == iy ? iy : r; /* x == y (including any +-0 pair): return y */
+    return as_double(r);
+}
+
diff --git a/amd-builtins/math64/pibits64.h b/amd-builtins/math64/pibits64.h new file mode 100644 index 0000000..e383a54 --- /dev/null +++ b/amd-builtins/math64/pibits64.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(uchar, PIBITS,, /* NOTE(review): raw byte table for large-argument trig reduction; presumably bits of 2/pi (little-endian bytes) -- confirm against the consuming reduction routine, which is not in this file */
+    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+    230, 139, 2, 0, 0, 0, 0, 0, 0, 0 /* zero padding at the end */
+)
+
diff --git a/amd-builtins/math64/powD.cl b/amd-builtins/math64/powD.cl new file mode 100644 index 0000000..e078d87 --- /dev/null +++ b/amd-builtins/math64/powD.cl
@@ -0,0 +1,24 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "powD_base.h" +
diff --git a/amd-builtins/math64/powD_base.h b/amd-builtins/math64/powD_base.h new file mode 100644 index 0000000..ec34bad --- /dev/null +++ b/amd-builtins/math64/powD_base.h
@@ -0,0 +1,280 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline, weak)) double
+#if defined(COMPILING_POWR)
+powr(double x, double y)
+#elif defined(COMPILING_POWN)
+pown(double x, int ny)
+#elif defined(COMPILING_ROOTN)
+rootn(double x, int ny)
+#else
+pow(double x, double y)
+#endif
+{ /* shared body for pow/powr/pown/rootn: exp(y * ln x) in double-double precision; variant chosen by COMPILING_* macros */
+    const double real_log2_tail = 5.76999904754328540596e-08; /* ln(2) split into head ... */
+    const double real_log2_lead = 6.93147122859954833984e-01; /* ... and tail for exact exponent scaling */
+
+    USE_TABLE(double2, p_powlog_tbl, POWLOG_TBL);
+    USE_TABLE(double2, p_log_F_inv_tbl, LOG_F_INV_TBL);
+    USE_TABLE(double2, p_two_to, TWO_TO_JBY64_EP);
+
+#if defined(COMPILING_POWN)
+    double y = (double) ny; /* pown: exponent promoted to double (tail corrected below if |ny| large) */
+#elif defined(COMPILING_ROOTN)
+    double dny = (double)ny;
+    double y = 1.0 / dny; /* rootn: y = 1/n (tail corrected below) */
+#endif
+
+    long ux = as_long(x);
+    long ax = ux & (~SIGNBIT_DP64); /* |x| bits */
+    int xpos = ax == ux; /* true iff sign bit of x is clear */
+
+    long uy = as_long(y);
+    long ay = uy & (~SIGNBIT_DP64); /* |y| bits */
+    int ypos = ay == uy; /* true iff sign bit of y is clear */
+
+    // Extended precision log
+    double v, vt; /* (v, vt) = head/tail of y * ln|x| */
+    {
+        int exp = (int)(ax >> 52) - 1023;
+        int mask_exp_1023 = exp == -1023; /* biased exponent 0: subnormal (or zero) input */
+        double xexp = (double) exp;
+        long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+        /* Renormalize a subnormal: scale its mantissa into [1,2), subtract 1, and recover the true exponent. */
+        long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+        exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+        double xexp1 = (double) exp;
+        long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+        xexp = mask_exp_1023 ? xexp1 : xexp;
+        mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+        /* Quantize the mantissa (top 8 bits + round bit) to pick the table entry F and its reciprocal. */
+        long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
+        int index = rax >> 44;
+
+        double F = as_double(rax | 0x3FE0000000000000L); /* quantized fraction in [0.5, 1) */
+        double Y = as_double(mantissa | 0x3FE0000000000000L); /* exact fraction in [0.5, 1) */
+        double f = F - Y;
+        double2 tv = p_log_F_inv_tbl[index];
+        double log_h = tv.s0;
+        double log_t = tv.s1;
+        double f_inv = (log_h + log_t) * f; /* r ~= f / F */
+        double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); /* truncate r to a short mantissa */
+        double r2 = fma(-F, r1, f) * (log_h + log_t); /* exact correction term for the truncation */
+        double r = r1 + r2;
+
+        /* Polynomial for -ln(1 - r): r^3/3 + r^4/4 + ... (the r and r^2/2 terms are handled exactly below). */
+        double poly = fma(r,
+                          fma(r,
+                              fma(r, 1.0/7.0, 1.0/6.0),
+                              1.0/5.0),
+                          1.0/4.0);
+        poly = fma(r, poly, 1.0/3.0);
+        poly = poly * r * r * r;
+
+        double hr1r1 = 0.5*r1*r1;
+        double poly0h = r1 + hr1r1; /* head of r + r^2/2 */
+        double poly0t = r1 - poly0h + hr1r1; /* exact tail of that sum */
+        poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
+
+        tv = p_powlog_tbl[index]; /* (head, tail) of ln(F) */
+        log_h = tv.s0;
+        log_t = tv.s1;
+
+        /* ln|x| = xexp*ln2 + ln(F) - ln(1 - r), accumulated as head (resH) + tail (resT...). */
+        double resT_t = fma(xexp, real_log2_tail,
+                            log_t) - poly;
+        double resT = resT_t - poly0h;
+        double resH = fma(xexp, real_log2_lead, log_h);
+        double resT_h = poly0h;
+
+        double H = resT + resH;
+        double H_h = as_double(as_long(H) & 0xfffffffff8000000L); /* shorten the head so products with y_head are exact */
+        double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+        H = H_h;
+
+        double y_head = as_double(uy & 0xfffffffff8000000L); /* split y the same way */
+        double y_tail = y - y_head;
+
+#if defined(COMPILING_POWN)
+        int mask_2_24 = ay > 0x4170000000000000; // 2^24
+        int nyh = convert_int(y_head);
+        int nyt = ny - nyh;
+        double y_tail1 = (double)nyt; /* exact tail of large integer exponents */
+        y_tail = mask_2_24 ? y_tail1 : y_tail;
+#endif
+
+#if defined(COMPILING_ROOTN)
+        double fnyh = as_double(as_long(dny) & 0xfffffffffff00000); /* split n to refine y = 1/n */
+        double fnyt = (double)(ny - (int)fnyh);
+        y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0))/ dny; /* Newton-style correction of the reciprocal */
+#endif
+
+        double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
+        v = fma(y_head, H, temp); /* head of y*ln|x| */
+        vt = fma(y_head, H, -v) + temp; /* exact tail of y*ln|x| */
+    }
+
+    // Now calculate exp of (v,vt)
+
+    double expv;
+    {
+        const double max_exp_arg = 709.782712893384; /* above this exp overflows to +inf */
+        const double min_exp_arg = -745.1332191019411; /* below this exp underflows to 0 */
+        const double sixtyfour_by_lnof2 = 92.33248261689366;
+        const double lnof2_by_64_head = 0.010830424260348081; /* ln(2)/64 split head/tail */
+        const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+        double temp = v * sixtyfour_by_lnof2; /* reduce: v = (m*64 + j)*ln2/64 + r */
+        int n = (int)temp;
+        double dn = (double)n;
+        int j = n & 0x0000003f;
+        int m = n >> 6;
+
+        double2 tv = p_two_to[j]; /* 2^(j/64) as head + tail */
+        double f1 = tv.s0;
+        double f2 = tv.s1;
+        double f = f1 + f2;
+
+        double r1 = fma(dn, -lnof2_by_64_head, v);
+        double r2 = dn * lnof2_by_64_tail;
+        double r = (r1 + r2) + vt; /* reduced argument, tail folded in */
+
+        /* Minimax polynomial for expm1(r)/r on the reduced range. */
+        double q = fma(r,
+                       fma(r,
+                           fma(r,
+                               fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                               4.16666666662260795726e-02),
+                           1.66666666665260878863e-01),
+                       5.00000000000000008883e-01);
+        q = fma(r*r, q, r);
+
+        expv = fma(f, q, f2) + f1; /* 2^(j/64) * exp(r) */
+        expv = ldexp(expv, m); /* scale by 2^m */
+
+        expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; /* overflow -> +inf */
+        expv = v < min_exp_arg ? 0.0 : expv; /* underflow -> 0 */
+    }
+
+    // See whether y is an integer.
+    // inty = 0 means not an integer.
+    // inty = 1 means odd integer.
+    // inty = 2 means even integer.
+
+#if defined(COMPILING_POWN) | defined(COMPILING_ROOTN)
+    int inty = 2 - (ny & 1); /* ny is an int, so parity alone decides */
+#else
+    int inty;
+    {
+        int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1; /* number of integer bits in y */
+        inty = yexp < 1 ? 0 : 2; /* |y| < 1: cannot be a nonzero integer */
+        inty = yexp > 53 ? 2 : inty; /* huge y: every representable value is an even integer */
+        long mask = (1L << (53 - yexp)) - 1L; /* mask of fractional mantissa bits */
+        int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2; /* parity of the integer part */
+        inty1 = (ay & mask) != 0 ? 0 : inty1; /* any fractional bit set -> not an integer */
+        inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
+    }
+#endif
+
+    expv *= inty == 1 & !xpos ? -1.0 : 1.0; /* negative base to an odd integer power -> negative result */
+
+    long ret = as_long(expv);
+
+    // Now all the edge cases
+
+#if defined COMPILING_POWR /* powr special cases per the OpenCL spec */
+    ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+    ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ret;
+    ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64 : ret;
+    ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+    ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret;
+    ret = ax == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ax == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax == PINFBITPATT_DP64 & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ax == PINFBITPATT_DP64 & ay == 0L ? QNANBITPATT_DP64 : ret;
+    ret = ax == 0L & !ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax == 0L & ypos ? 0L : ret;
+    ret = ax == 0L & ay == 0L ? QNANBITPATT_DP64 : ret;
+    ret = ax != 0L & !xpos ? QNANBITPATT_DP64 : ret; /* powr of a negative base is NaN */
+    ret = ax > PINFBITPATT_DP64 ? ux : ret; /* NaN x propagates */
+    ret = ay > PINFBITPATT_DP64 ? uy : ret; /* NaN y propagates */
+#elif defined COMPILING_POWN /* pown special cases */
+    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64; /* infinity carrying x's sign */
+    ret = ax == 0L & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0L & !ypos & inty == 2 ? PINFBITPATT_DP64 : ret;
+    ret = ax == 0L & ypos & inty == 2 ? 0L : ret;
+    long xzero = !xpos ? 0x8000000000000000L : 0L; /* zero carrying x's sign */
+    ret = ax == 0L & ypos & inty == 1 ? xzero : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty == 1 ? 0x8000000000000000L : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty != 1 ? 0L : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty == 1 ? NINFBITPATT_DP64 : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty != 1 ? PINFBITPATT_DP64 : ret;
+    ret = ux == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ux == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret; /* NaN x propagates */
+    ret = ny == 0 ? 0x3ff0000000000000L : ret; /* x^0 = 1 */
+#elif defined COMPILING_ROOTN /* rootn special cases */
+    ret = !xpos & inty == 2 ? QNANBITPATT_DP64 : ret; /* even root of a negative number */
+    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+    ret = ax == 0L & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0L & !ypos & inty == 2 ? PINFBITPATT_DP64 : ret;
+    ret = ax == 0L & ypos & inty == 2 ? 0L : ret;
+    long xzero = xpos ? 0L : 0x8000000000000000L;
+    ret = ax == 0L & ypos & inty == 1 ? xzero : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty == 1 ? NINFBITPATT_DP64 : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty == 1 ? 0x8000000000000000L : ret;
+    ret = ux == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ux == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret; /* NaN x propagates */
+    ret = ny == 0 ? QNANBITPATT_DP64 : ret; /* rootn(x, 0) is undefined */
+#else /* pow special cases per IEEE 754 / C99 */
+    ret = !xpos & inty == 0 ? QNANBITPATT_DP64 : ret; /* negative base, non-integer exponent */
+    ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+    ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+    ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+    ret = ax == 0L & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0L & !ypos & inty != 1 ? PINFBITPATT_DP64 : ret;
+    long xzero = xpos ? 0L : 0x8000000000000000L;
+    ret = ax == 0L & ypos & inty == 1 ? xzero : ret;
+    ret = ax == 0L & ypos & inty != 1 ? 0L : ret;
+    ret = ax == 0L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ux == 0xbff0000000000000L & ay == PINFBITPATT_DP64 ? 0x3ff0000000000000L : ret; /* (-1)^(+-inf) = 1 */
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty == 1 ? 0x8000000000000000L : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty != 1 ? 0L : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty == 1 ? NINFBITPATT_DP64 : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty != 1 ? PINFBITPATT_DP64 : ret;
+    ret = ux == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ux == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret; /* NaN x propagates */
+    ret = ay > PINFBITPATT_DP64 ? uy : ret; /* NaN y propagates */
+    ret = ay == 0L ? 0x3ff0000000000000L : ret; /* x^(+-0) = 1, even for NaN x */
+    ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret; /* 1^y = 1, even for NaN y */
+#endif
+
+    return as_double(ret);
+}
+
diff --git a/amd-builtins/math64/powD_table.h b/amd-builtins/math64/powD_table.h new file mode 100644 index 0000000..95c9d5d --- /dev/null +++ b/amd-builtins/math64/powD_table.h
@@ -0,0 +1,544 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + + +DECLARE_TABLE(double2, POWLOG_TBL, 258, + (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.ff00aa0000000p-9, 0x1.5885e0250435ap-36), + (double2)(0x1.fe02a60000000p-8, 0x1.620cf11f86ed2p-33), + (double2)(0x1.7dc4750000000p-7, 0x1.f0214edba4a25p-32), + (double2)(0x1.fc0a8b0000000p-7, 0x1.f807c79f3db4ep-36), + (double2)(0x1.3cea440000000p-6, 0x1.a352ba779a52bp-33), + (double2)(0x1.7b91b00000000p-6, 0x1.f56c46aa49fd5p-32), + (double2)(0x1.b9fc020000000p-6, 0x1.ebe465fef5196p-32), + (double2)(0x1.f829b00000000p-6, 0x1.cf0660099f1f8p-31), + (double2)(0x1.1b0d980000000p-5, 0x1.247b2ff85945dp-30), + (double2)(0x1.39e87b0000000p-5, 0x1.3fd7abf5202b6p-30), + (double2)(0x1.58a5ba0000000p-5, 0x1.f91c9a918d51ep-30), + (double2)(0x1.77458f0000000p-5, 0x1.8cb73f118d3cap-31), + (double2)(0x1.95c8300000000p-5, 0x1.d91c7d6fad074p-30), + (double2)(0x1.b42dd70000000p-5, 0x1.1971bec28d14cp-33), + (double2)(0x1.d276b80000000p-5, 0x1.5b616a423c78ap-30), + (double2)(0x1.f0a30c0000000p-5, 0x1.162a6617cc971p-37), + (double2)(0x1.0759830000000p-4, 0x1.66391c4c06d29p-30), + (double2)(0x1.16536e0000000p-4, 0x1.d46f5c1d0c4b8p-29), + (double2)(0x1.253f620000000p-4, 0x1.e14282df1f6d3p-29), + (double2)(0x1.341d790000000p-4, 0x1.86f47424a660dp-30), + (double2)(0x1.42edcb0000000p-4, 0x1.d4c8de077753ep-29), + (double2)(0x1.51b0730000000p-4, 0x1.e0c307ed24f1cp-29), + (double2)(0x1.60658a0000000p-4, 0x1.26ea18763bdd3p-29), + (double2)(0x1.6f0d280000000p-4, 0x1.5cad69737c933p-29), + (double2)(0x1.7da7660000000p-4, 0x1.af62599088901p-29), + (double2)(0x1.8c345d0000000p-4, 0x1.8c66c83d6b2d0p-30), + (double2)(0x1.9ab4240000000p-4, 0x1.880ceb36fb30fp-30), + (double2)(0x1.a926d30000000p-4, 0x1.495aac6ca17a4p-29), + (double2)(0x1.b78c820000000p-4, 0x1.761db4210878cp-29), + (double2)(0x1.c5e5480000000p-4, 0x1.eb78e862bac2fp-29), + (double2)(0x1.d4313d0000000p-4, 0x1.9b2cd75790dd9p-30), + (double2)(0x1.e270760000000p-4, 0x1.c55e5cbd3d50fp-29), + (double2)(0x1.f0a30c0000000p-4, 
0x1.162a6617cc971p-36), + (double2)(0x1.fec9130000000p-4, 0x1.dbeabaaa2e519p-32), + (double2)(0x1.0671510000000p-3, 0x1.652cb7150c647p-30), + (double2)(0x1.0d77e70000000p-3, 0x1.9a11cb2cd2ee2p-28), + (double2)(0x1.1478580000000p-3, 0x1.19d0ab1a28813p-29), + (double2)(0x1.1b72ad0000000p-3, 0x1.4bd9e80a41811p-29), + (double2)(0x1.2266f10000000p-3, 0x1.214b596faa3dfp-28), + (double2)(0x1.29552f0000000p-3, 0x1.03fea46980bb8p-28), + (double2)(0x1.303d710000000p-3, 0x1.1c8ffa5fd28c7p-28), + (double2)(0x1.371fc20000000p-3, 0x1.e8f743bcd96c5p-35), + (double2)(0x1.3dfc2b0000000p-3, 0x1.d98c5395315c6p-32), + (double2)(0x1.44d2b60000000p-3, 0x1.996fa3ccfa7b2p-28), + (double2)(0x1.4ba36f0000000p-3, 0x1.cd2af2ad13037p-30), + (double2)(0x1.526e5e0000000p-3, 0x1.d0da1bd17200ep-30), + (double2)(0x1.59338d0000000p-3, 0x1.330410ba68b75p-28), + (double2)(0x1.5ff3070000000p-3, 0x1.4f27a790e7c41p-32), + (double2)(0x1.66acd40000000p-3, 0x1.3956a86f6ff1bp-30), + (double2)(0x1.6d60fe0000000p-3, 0x1.c6748723551d9p-29), + (double2)(0x1.740f8f0000000p-3, 0x1.500de9326cdfcp-29), + (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30), + (double2)(0x1.815c0a0000000p-3, 0x1.4357ead6836ffp-31), + (double2)(0x1.87fa060000000p-3, 0x1.4832442408024p-29), + (double2)(0x1.8e928d0000000p-3, 0x1.d10da8154b13dp-28), + (double2)(0x1.9525a90000000p-3, 0x1.9e8ad68ec8260p-28), + (double2)(0x1.9bb3620000000p-3, 0x1.cfbf706abaf18p-28), + (double2)(0x1.a23bc10000000p-3, 0x1.fc56ac6326e23p-28), + (double2)(0x1.a8becf0000000p-3, 0x1.9105e3185cf21p-28), + (double2)(0x1.af3c940000000p-3, 0x1.d017fe5b19cc0p-28), + (double2)(0x1.b5b5190000000p-3, 0x1.d1f6b48dd13fep-28), + (double2)(0x1.bc28670000000p-3, 0x1.0b63358a7e73ap-29), + (double2)(0x1.c296850000000p-3, 0x1.63063028c211cp-29), + (double2)(0x1.c8ff7c0000000p-3, 0x1.e6a6886b09760p-29), + (double2)(0x1.cf63540000000p-3, 0x1.c138bb891cd03p-28), + (double2)(0x1.d5c2160000000p-3, 0x1.69f7722b7221ap-28), + (double2)(0x1.dc1bca0000000p-3, 0x1.57d8fac1a628cp-32), + 
(double2)(0x1.e270760000000p-3, 0x1.c55e5cbd3d50fp-28), + (double2)(0x1.e8c0250000000p-3, 0x1.552d2ff48fe2ep-30), + (double2)(0x1.ef0adc0000000p-3, 0x1.7b8b26ca431bcp-28), + (double2)(0x1.f550a50000000p-3, 0x1.92decdc1c5f6dp-29), + (double2)(0x1.fb91860000000p-3, 0x1.abc7c551aaa8cp-28), + (double2)(0x1.00e6c40000000p-2, 0x1.6b540731a354bp-28), + (double2)(0x1.0402590000000p-2, 0x1.2d341036b89efp-28), + (double2)(0x1.071b850000000p-2, 0x1.f9ab21a3a2e0fp-27), + (double2)(0x1.0a324e0000000p-2, 0x1.39c871afb9fbdp-29), + (double2)(0x1.0d46b50000000p-2, 0x1.e6add2c81f640p-28), + (double2)(0x1.1058bf0000000p-2, 0x1.35c95aa313f41p-27), + (double2)(0x1.1368700000000p-2, 0x1.49d4582f6cc53p-29), + (double2)(0x1.1675ca0000000p-2, 0x1.7574c1c07398fp-27), + (double2)(0x1.1980d20000000p-2, 0x1.ba846dece9e8dp-27), + (double2)(0x1.1c898c0000000p-2, 0x1.6999fafbc68e7p-30), + (double2)(0x1.1f8ff90000000p-2, 0x1.c9145e51b0103p-27), + (double2)(0x1.22941f0000000p-2, 0x1.79ef2cb44850ap-27), + (double2)(0x1.2596010000000p-2, 0x1.beec73de11275p-31), + (double2)(0x1.2895a10000000p-2, 0x1.ef4351af5a498p-29), + (double2)(0x1.2b93030000000p-2, 0x1.5713a493b4a50p-27), + (double2)(0x1.2e8e2b0000000p-2, 0x1.5c23a61385992p-27), + (double2)(0x1.31871c0000000p-2, 0x1.2a88309f57299p-27), + (double2)(0x1.347dd90000000p-2, 0x1.530faa9ac8acep-27), + (double2)(0x1.3772660000000p-2, 0x1.5fec2d792a758p-29), + (double2)(0x1.3a64c50000000p-2, 0x1.5a517a71cbcd7p-28), + (double2)(0x1.3d54fa0000000p-2, 0x1.707dc3e1cd9a3p-28), + (double2)(0x1.4043080000000p-2, 0x1.a1a9f8ef43049p-28), + (double2)(0x1.432ef20000000p-2, 0x1.409d0276b3674p-27), + (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bd9p-29), + (double2)(0x1.4900680000000p-2, 0x1.0027433001e5fp-32), + (double2)(0x1.4be5f90000000p-2, 0x1.5dde2836d3265p-28), + (double2)(0x1.4ec9730000000p-2, 0x1.300134d7aaf04p-29), + (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f5p-28), + (double2)(0x1.548a2c0000000p-2, 0x1.d6e93167e6308p-29), + 
(double2)(0x1.5767710000000p-2, 0x1.d1569b1526adbp-28), + (double2)(0x1.5a42ab0000000p-2, 0x1.e99fc338a1a41p-31), + (double2)(0x1.5d1bdb0000000p-2, 0x1.eb01394a11b1cp-27), + (double2)(0x1.5ff3070000000p-2, 0x1.4f27a790e7c41p-31), + (double2)(0x1.62c82f0000000p-2, 0x1.5ce3ca97b7af9p-29), + (double2)(0x1.659b570000000p-2, 0x1.81f0f940ed857p-29), + (double2)(0x1.686c810000000p-2, 0x1.d36295d88857cp-27), + (double2)(0x1.6b3bb20000000p-2, 0x1.1aca1ec4af526p-29), + (double2)(0x1.6e08ea0000000p-2, 0x1.45743c7182726p-27), + (double2)(0x1.70d42e0000000p-2, 0x1.3c491aead337ep-29), + (double2)(0x1.739d7f0000000p-2, 0x1.aef401a738931p-28), + (double2)(0x1.7664e10000000p-2, 0x1.1cede76092a29p-29), + (double2)(0x1.792a550000000p-2, 0x1.fba8f44f82bb4p-27), + (double2)(0x1.7bede00000000p-2, 0x1.46f5f7f3c3e1ap-27), + (double2)(0x1.7eaf830000000p-2, 0x1.7055f86c9674bp-27), + (double2)(0x1.816f410000000p-2, 0x1.b41a92b6b6e1ap-27), + (double2)(0x1.842d1d0000000p-2, 0x1.43d162e927628p-27), + (double2)(0x1.86e9190000000p-2, 0x1.466174013f9b1p-27), + (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28), + (double2)(0x1.8c5b7c0000000p-2, 0x1.0b169150faa58p-27), + (double2)(0x1.8f11e80000000p-2, 0x1.cd98b1df85da7p-28), + (double2)(0x1.91c67e0000000p-2, 0x1.68b507b0f8fa8p-27), + (double2)(0x1.9479410000000p-2, 0x1.8422df57499bap-27), + (double2)(0x1.972a340000000p-2, 0x1.1351586970274p-30), + (double2)(0x1.99d9580000000p-2, 0x1.17e08acba92eep-30), + (double2)(0x1.9c86b00000000p-2, 0x1.6e04314dd0229p-29), + (double2)(0x1.9f323e0000000p-2, 0x1.97f3097e56d1ap-27), + (double2)(0x1.a1dc060000000p-2, 0x1.356e655901286p-28), + (double2)(0x1.a484090000000p-2, 0x1.cb761457f94d6p-31), + (double2)(0x1.a72a490000000p-2, 0x1.9af67a85a9dacp-28), + (double2)(0x1.a9cec90000000p-2, 0x1.53410931a909fp-27), + (double2)(0x1.ac718c0000000p-2, 0x1.2c587206058f5p-29), + (double2)(0x1.af12930000000p-2, 0x1.23bc358899c22p-29), + (double2)(0x1.b1b1e00000000p-2, 0x1.d7bf8b6d223cbp-27), + 
(double2)(0x1.b44f770000000p-2, 0x1.7991ec5197ddbp-27), + (double2)(0x1.b6eb590000000p-2, 0x1.a79e6bb3a9219p-27), + (double2)(0x1.b985890000000p-2, 0x1.a4c43ed663ec5p-28), + (double2)(0x1.bc1e080000000p-2, 0x1.61b5a1484f438p-27), + (double2)(0x1.beb4d90000000p-2, 0x1.b4e36f7ef0c3ap-27), + (double2)(0x1.c149ff0000000p-2, 0x1.15f026acd0d1bp-30), + (double2)(0x1.c3dd7a0000000p-2, 0x1.f36b535cecf05p-28), + (double2)(0x1.c66f4e0000000p-2, 0x1.ffb7fbf3eb5c6p-29), + (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28), + (double2)(0x1.cb8e070000000p-2, 0x1.135eb27f5bbc3p-28), + (double2)(0x1.ce1af00000000p-2, 0x1.70be7d6f6fa57p-27), + (double2)(0x1.d0a63a0000000p-2, 0x1.ce43cc84ab338p-27), + (double2)(0x1.d32fe70000000p-2, 0x1.c01d7aac3bd91p-27), + (double2)(0x1.d5b7f90000000p-2, 0x1.5c58d07961060p-27), + (double2)(0x1.d83e720000000p-2, 0x1.628bcf941456ep-28), + (double2)(0x1.dac3530000000p-2, 0x1.c58b2a8461cd2p-27), + (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989ap-28), + (double2)(0x1.dfc8590000000p-2, 0x1.20dab6a80f09cp-27), + (double2)(0x1.e248810000000p-2, 0x1.4f8d84c397b1ep-27), + (double2)(0x1.e4c71a0000000p-2, 0x1.0d0ee08599e48p-27), + (double2)(0x1.e744260000000p-2, 0x1.d68787e37da36p-30), + (double2)(0x1.e9bfa60000000p-2, 0x1.66187d591bafcp-28), + (double2)(0x1.ec399d0000000p-2, 0x1.2346600bae772p-29), + (double2)(0x1.eeb20c0000000p-2, 0x1.90377d0d61b8ep-28), + (double2)(0x1.f128f50000000p-2, 0x1.f5e0dd966b907p-27), + (double2)(0x1.f39e5b0000000p-2, 0x1.9023cb79a00e2p-27), + (double2)(0x1.f6123f0000000p-2, 0x1.4e05158c28ad8p-27), + (double2)(0x1.f884a30000000p-2, 0x1.bfa7b08b18ae4p-28), + (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f67p-27), + (double2)(0x1.fd64f20000000p-2, 0x1.ec2ae39493d4fp-31), + (double2)(0x1.ffd2e00000000p-2, 0x1.0afe930ab2fa0p-27), + (double2)(0x1.011fab0000000p-1, 0x1.25ff8a1810dd4p-29), + (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27), + (double2)(0x1.0389ee0000000p-1, 0x1.f9cc676785571p-26), + 
(double2)(0x1.04bdf90000000p-1, 0x1.b524da4cbf982p-26), + (double2)(0x1.05f14b0000000p-1, 0x1.a4c8b381535b8p-26), + (double2)(0x1.0723e50000000p-1, 0x1.839be809caf2cp-26), + (double2)(0x1.0855c80000000p-1, 0x1.0968a1cb82c13p-26), + (double2)(0x1.0986f40000000p-1, 0x1.eae6a41723fb5p-26), + (double2)(0x1.0ab76b0000000p-1, 0x1.d9c29a380a4dbp-26), + (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27), + (double2)(0x1.0d163c0000000p-1, 0x1.973ad6fc108cap-26), + (double2)(0x1.0e44980000000p-1, 0x1.747322fdbab97p-27), + (double2)(0x1.0f72410000000p-1, 0x1.93692fa9d4221p-26), + (double2)(0x1.109f390000000p-1, 0x1.c5a992dfbc7d9p-26), + (double2)(0x1.11cb810000000p-1, 0x1.e1f33e102387ap-27), + (double2)(0x1.12f7190000000p-1, 0x1.64fbef14c048cp-27), + (double2)(0x1.1422020000000p-1, 0x1.490f513ca5e3bp-27), + (double2)(0x1.154c3d0000000p-1, 0x1.7a6af4d4c799dp-28), + (double2)(0x1.1675ca0000000p-1, 0x1.7574c1c07398fp-26), + (double2)(0x1.179eab0000000p-1, 0x1.7b133417f8c1cp-26), + (double2)(0x1.18c6e00000000p-1, 0x1.feb9e0c176514p-26), + (double2)(0x1.19ee6b0000000p-1, 0x1.19f25bb3172f7p-27), + (double2)(0x1.1b154b0000000p-1, 0x1.5f68a7bbfb852p-27), + (double2)(0x1.1c3b810000000p-1, 0x1.ee278497929f1p-26), + (double2)(0x1.1d610f0000000p-1, 0x1.ccee006109d58p-26), + (double2)(0x1.1e85f50000000p-1, 0x1.ce081a07bd8b3p-26), + (double2)(0x1.1faa340000000p-1, 0x1.70e12981817b8p-26), + (double2)(0x1.20cdcd0000000p-1, 0x1.92ab6d93503d0p-29), + (double2)(0x1.21f0bf0000000p-1, 0x1.8cb7dd7c3b61ep-26), + (double2)(0x1.23130d0000000p-1, 0x1.efafd0a0b78dap-27), + (double2)(0x1.2434b60000000p-1, 0x1.e907267c4288ep-26), + (double2)(0x1.2555bc0000000p-1, 0x1.d31ef96780875p-26), + (double2)(0x1.2676200000000p-1, 0x1.3430dfcd2ad50p-29), + (double2)(0x1.2795e10000000p-1, 0x1.44d88d75bc1f9p-28), + (double2)(0x1.28b5000000000p-1, 0x1.bec0f055e04fcp-26), + (double2)(0x1.29d37f0000000p-1, 0x1.d85611590b9adp-26), + (double2)(0x1.2af15f0000000p-1, 0x1.320568e583229p-32), + 
(double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26), + (double2)(0x1.2d2b400000000p-1, 0x1.2edc9dabba74dp-29), + (double2)(0x1.2e47430000000p-1, 0x1.b9009a1015086p-27), + (double2)(0x1.2f62a90000000p-1, 0x1.2a12a8c5b1a19p-26), + (double2)(0x1.307d730000000p-1, 0x1.a7885f0fdac85p-28), + (double2)(0x1.3197a00000000p-1, 0x1.f4ffcd43ac691p-26), + (double2)(0x1.32b1330000000p-1, 0x1.2243ae2640aadp-26), + (double2)(0x1.33ca2b0000000p-1, 0x1.46513299035d3p-26), + (double2)(0x1.34e2890000000p-1, 0x1.b39c3a62dd725p-26), + (double2)(0x1.35fa4e0000000p-1, 0x1.ba6dd40049f51p-26), + (double2)(0x1.37117b0000000p-1, 0x1.51d1ed7177409p-27), + (double2)(0x1.38280f0000000p-1, 0x1.cb0f2fd7f5216p-26), + (double2)(0x1.393e0d0000000p-1, 0x1.ab150cd4e2213p-28), + (double2)(0x1.3a53730000000p-1, 0x1.cfd7bf3193844p-26), + (double2)(0x1.3b68440000000p-1, 0x1.3fff8455f1dbdp-26), + (double2)(0x1.3c7c7f0000000p-1, 0x1.fee640b905fc9p-26), + (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26), + (double2)(0x1.3ea3390000000p-1, 0x1.b597adc1ecdd2p-28), + (double2)(0x1.3fb5b80000000p-1, 0x1.345bd096d3a75p-27), + (double2)(0x1.40c7a40000000p-1, 0x1.101b9d2453c8bp-26), + (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c979p-26), + (double2)(0x1.42e9c60000000p-1, 0x1.bbf017e595f71p-26), + (double2)(0x1.43f9fe0000000p-1, 0x1.7ce733bd393dcp-28), + (double2)(0x1.4509a50000000p-1, 0x1.33bb0a503f8a1p-29), + (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bd9p-28), + (double2)(0x1.4727430000000p-1, 0x1.e67555a635b3cp-26), + (double2)(0x1.48353d0000000p-1, 0x1.ea88df73d5e8bp-29), + (double2)(0x1.4942a80000000p-1, 0x1.d17e03bda18a8p-28), + (double2)(0x1.4a4f850000000p-1, 0x1.b607d76044f7ep-26), + (double2)(0x1.4b5bd60000000p-1, 0x1.2adc4e71bc2fcp-26), + (double2)(0x1.4c679a0000000p-1, 0x1.f99dc7362d1d9p-26), + (double2)(0x1.4d72d30000000p-1, 0x1.473fa008e6a6ap-26), + (double2)(0x1.4e7d810000000p-1, 0x1.b75bb09cb0985p-29), + (double2)(0x1.4f87a30000000p-1, 0x1.ea04dd10b9abap-26), + 
(double2)(0x1.50913c0000000p-1, 0x1.802d0d6979674p-26), + (double2)(0x1.519a4c0000000p-1, 0x1.74688ccd99094p-30), + (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27), + (double2)(0x1.53aad00000000p-1, 0x1.6e66df2aa374fp-27), + (double2)(0x1.54b2460000000p-1, 0x1.e66525ea4550ap-27), + (double2)(0x1.55b9350000000p-1, 0x1.2d02f34f20cbdp-27), + (double2)(0x1.56bf9d0000000p-1, 0x1.6cfce65047188p-27), + (double2)(0x1.57c57f0000000p-1, 0x1.9b78c842d58b8p-28), + (double2)(0x1.58cadb0000000p-1, 0x1.735e624c24bc9p-27), + (double2)(0x1.59cfb20000000p-1, 0x1.7eba1f7dd1adfp-27), + (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26), + (double2)(0x1.5bd7d30000000p-1, 0x1.ce38e637f1b4dp-30), + (double2)(0x1.5cdb1d0000000p-1, 0x1.8d82ec919edc7p-26), + (double2)(0x1.5ddde50000000p-1, 0x1.c52648ddcfa37p-27), + (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26), + (double2)(0x1.5fe1ed0000000p-1, 0x1.5a312311aba4fp-26), + (double2)(0x1.60e32f0000000p-1, 0x1.11e236329f225p-27), + (double2)(0x1.61e3ef0000000p-1, 0x1.b48c8cd2f246cp-26), + (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25), + (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), +) + +DECLARE_TABLE(double2, LOG_F_INV_TBL, 258, + (double2)(0x1.0000000000000p+1, 0x0.0000000000000p+0), + (double2)(0x1.fe00000000000p+0, 0x1.fe01fe01fe020p-16), + (double2)(0x1.fc00000000000p+0, 0x1.fc07f01fc07f0p-14), + (double2)(0x1.fa00000000000p+0, 0x1.1caa01fa11caap-12), + (double2)(0x1.f800000000000p+0, 0x1.f81f81f81f820p-12), + (double2)(0x1.f600000000000p+0, 0x1.8856506ddaba6p-11), + (double2)(0x1.f400000000000p+0, 0x1.196792909c560p-10), + (double2)(0x1.f200000000000p+0, 0x1.7d9108c2ad433p-10), + (double2)(0x1.f000000000000p+0, 0x1.f07c1f07c1f08p-10), + (double2)(0x1.ee00000000000p+0, 0x1.38ff08b1c03ddp-9), + (double2)(0x1.ec00000000000p+0, 0x1.80f6603d980f6p-9), + (double2)(0x1.ea00000000000p+0, 0x1.d00f57403d5d0p-9), + (double2)(0x1.e900000000000p+0, 0x1.31abf0b7672a0p-12), + (double2)(0x1.e700000000000p+0, 
0x1.06a965d43919bp-10), + (double2)(0x1.e500000000000p+0, 0x1.ceb240795ceb2p-10), + (double2)(0x1.e300000000000p+0, 0x1.522f3b834e67fp-9), + (double2)(0x1.e100000000000p+0, 0x1.c3c3c3c3c3c3cp-9), + (double2)(0x1.e000000000000p+0, 0x1.e01e01e01e01ep-12), + (double2)(0x1.de00000000000p+0, 0x1.75b8fe21a291cp-10), + (double2)(0x1.dc00000000000p+0, 0x1.403b9403b9404p-9), + (double2)(0x1.da00000000000p+0, 0x1.cc0ed7303b5ccp-9), + (double2)(0x1.d900000000000p+0, 0x1.79118f3fc4da2p-11), + (double2)(0x1.d700000000000p+0, 0x1.ed952e0b0ce46p-10), + (double2)(0x1.d500000000000p+0, 0x1.95900eae56404p-9), + (double2)(0x1.d400000000000p+0, 0x1.d41d41d41d41dp-12), + (double2)(0x1.d200000000000p+0, 0x1.cb28ff16c69aep-10), + (double2)(0x1.d000000000000p+0, 0x1.96b1edd80e866p-9), + (double2)(0x1.cf00000000000p+0, 0x1.372e225fe30d9p-11), + (double2)(0x1.cd00000000000p+0, 0x1.0ad12073615a2p-9), + (double2)(0x1.cb00000000000p+0, 0x1.cdb2c0397cdb3p-9), + (double2)(0x1.ca00000000000p+0, 0x1.2cc157b864407p-10), + (double2)(0x1.c800000000000p+0, 0x1.64cb5f7148404p-9), + (double2)(0x1.c700000000000p+0, 0x1.c71c71c71c71cp-12), + (double2)(0x1.c500000000000p+0, 0x1.129a21a930b84p-9), + (double2)(0x1.c300000000000p+0, 0x1.f1e0387f1e038p-9), + (double2)(0x1.c200000000000p+0, 0x1.ad4e4ba80709bp-10), + (double2)(0x1.c000000000000p+0, 0x1.c0e070381c0e0p-9), + (double2)(0x1.bf00000000000p+0, 0x1.60fba1a362bb0p-10), + (double2)(0x1.bd00000000000p+0, 0x1.a5713280dee96p-9), + (double2)(0x1.bc00000000000p+0, 0x1.3f59620f9ece9p-10), + (double2)(0x1.ba00000000000p+0, 0x1.9f22983759f23p-9), + (double2)(0x1.b900000000000p+0, 0x1.478ac63fc8d5cp-10), + (double2)(0x1.b700000000000p+0, 0x1.ad87bb4671656p-9), + (double2)(0x1.b600000000000p+0, 0x1.78b8efbb8148cp-10), + (double2)(0x1.b400000000000p+0, 0x1.d0369d0369d03p-9), + (double2)(0x1.b300000000000p+0, 0x1.d212b601b3748p-10), + (double2)(0x1.b200000000000p+0, 0x1.b2036406c80d9p-15), + (double2)(0x1.b000000000000p+0, 0x1.29663b24547d1p-9), + 
(double2)(0x1.af00000000000p+0, 0x1.435e50d79435ep-11), + (double2)(0x1.ad00000000000p+0, 0x1.7d0ff2920bc03p-9), + (double2)(0x1.ac00000000000p+0, 0x1.5c06b15c06b16p-10), + (double2)(0x1.aa00000000000p+0, 0x1.e3a5f0fd7f954p-9), + (double2)(0x1.a900000000000p+0, 0x1.1dec0d4c77b03p-9), + (double2)(0x1.a800000000000p+0, 0x1.73289870ac52ep-11), + (double2)(0x1.a600000000000p+0, 0x1.a034da034da03p-9), + (double2)(0x1.a500000000000p+0, 0x1.d041da2292856p-10), + (double2)(0x1.a400000000000p+0, 0x1.a41a41a41a41ap-12), + (double2)(0x1.a200000000000p+0, 0x1.8550f8a39409dp-9), + (double2)(0x1.a100000000000p+0, 0x1.b4fe5e92c0686p-10), + (double2)(0x1.a000000000000p+0, 0x1.a01a01a01a01ap-12), + (double2)(0x1.9e00000000000p+0, 0x1.91d2a2067b23ap-9), + (double2)(0x1.9d00000000000p+0, 0x1.e7c5dada0b4e5p-10), + (double2)(0x1.9c00000000000p+0, 0x1.68a7725080ce1p-11), + (double2)(0x1.9a00000000000p+0, 0x1.c49d4aa21b490p-9), + (double2)(0x1.9900000000000p+0, 0x1.3333333333333p-9), + (double2)(0x1.9800000000000p+0, 0x1.4bc363b03fccfp-10), + (double2)(0x1.9700000000000p+0, 0x1.c9f01970e4f81p-13), + (double2)(0x1.9500000000000p+0, 0x1.97617c6ef5b25p-9), + (double2)(0x1.9400000000000p+0, 0x1.161f9add3c0cap-9), + (double2)(0x1.9300000000000p+0, 0x1.319fe6cb39806p-10), + (double2)(0x1.9200000000000p+0, 0x1.f693a1c451ab3p-13), + (double2)(0x1.9000000000000p+0, 0x1.a9e240321a9e2p-9), + (double2)(0x1.8f00000000000p+0, 0x1.3831f3831f383p-9), + (double2)(0x1.8e00000000000p+0, 0x1.949ebc4dcfc1cp-10), + (double2)(0x1.8d00000000000p+0, 0x1.80c6980c6980cp-11), + (double2)(0x1.8b00000000000p+0, 0x1.f9d00c5fe7403p-9), + (double2)(0x1.8a00000000000p+0, 0x1.9721ed7e75347p-9), + (double2)(0x1.8900000000000p+0, 0x1.381ec0313381fp-9), + (double2)(0x1.8800000000000p+0, 0x1.b97c2aec12653p-10), + (double2)(0x1.8700000000000p+0, 0x1.09ef3024ae3bap-10), + (double2)(0x1.8600000000000p+0, 0x1.8618618618618p-12), + (double2)(0x1.8400000000000p+0, 0x1.e0184f00c2780p-9), + (double2)(0x1.8300000000000p+0, 
0x1.92ef5657dba52p-9), + (double2)(0x1.8200000000000p+0, 0x1.4940305494030p-9), + (double2)(0x1.8100000000000p+0, 0x1.0303030303030p-9), + (double2)(0x1.8000000000000p+0, 0x1.8060180601806p-10), + (double2)(0x1.7f00000000000p+0, 0x1.017f405fd017fp-10), + (double2)(0x1.7e00000000000p+0, 0x1.12a8ad278e8ddp-11), + (double2)(0x1.7d00000000000p+0, 0x1.7d05f417d05f4p-14), + (double2)(0x1.7b00000000000p+0, 0x1.d67245c02f7d6p-9), + (double2)(0x1.7a00000000000p+0, 0x1.a4411c1d986a9p-9), + (double2)(0x1.7900000000000p+0, 0x1.754d76c7316dfp-9), + (double2)(0x1.7800000000000p+0, 0x1.49902f149902fp-9), + (double2)(0x1.7700000000000p+0, 0x1.21023358c1a68p-9), + (double2)(0x1.7600000000000p+0, 0x1.f7390d2a6c406p-10), + (double2)(0x1.7500000000000p+0, 0x1.b2b0805d5b2b1p-10), + (double2)(0x1.7400000000000p+0, 0x1.745d1745d1746p-10), + (double2)(0x1.7300000000000p+0, 0x1.3c31507fa32c4p-10), + (double2)(0x1.7200000000000p+0, 0x1.0a1fd1b7af017p-10), + (double2)(0x1.7100000000000p+0, 0x1.bc36ce3e0453ap-11), + (double2)(0x1.7000000000000p+0, 0x1.702e05c0b8170p-11), + (double2)(0x1.6f00000000000p+0, 0x1.300b79300b793p-11), + (double2)(0x1.6e00000000000p+0, 0x1.f76b4337c6cb1p-12), + (double2)(0x1.6d00000000000p+0, 0x1.a62681c860fb0p-12), + (double2)(0x1.6c00000000000p+0, 0x1.6c16c16c16c17p-12), + (double2)(0x1.6b00000000000p+0, 0x1.490aa31a3cfc7p-12), + (double2)(0x1.6a00000000000p+0, 0x1.3cd153729043ep-12), + (double2)(0x1.6900000000000p+0, 0x1.473a88d0bfd2ep-12), + (double2)(0x1.6800000000000p+0, 0x1.6816816816817p-12), + (double2)(0x1.6700000000000p+0, 0x1.9f36016719f36p-12), + (double2)(0x1.6600000000000p+0, 0x1.ec6a5122f9016p-12), + (double2)(0x1.6500000000000p+0, 0x1.27c29da5519cfp-11), + (double2)(0x1.6400000000000p+0, 0x1.642c8590b2164p-11), + (double2)(0x1.6300000000000p+0, 0x1.ab5c45606f00bp-11), + (double2)(0x1.6200000000000p+0, 0x1.fd3b80b11fd3cp-11), + (double2)(0x1.6100000000000p+0, 0x1.2cda0c6ba4eaap-10), + (double2)(0x1.6000000000000p+0, 0x1.6058160581606p-10), + 
(double2)(0x1.5f00000000000p+0, 0x1.990d0a4b7ef87p-10), + (double2)(0x1.5e00000000000p+0, 0x1.d6ee340579d6fp-10), + (double2)(0x1.5d00000000000p+0, 0x1.0cf87d9c54a69p-9), + (double2)(0x1.5c00000000000p+0, 0x1.310572620ae4cp-9), + (double2)(0x1.5b00000000000p+0, 0x1.5798c8ff522a2p-9), + (double2)(0x1.5a00000000000p+0, 0x1.80ad602b580adp-9), + (double2)(0x1.5900000000000p+0, 0x1.ac3e24799546fp-9), + (double2)(0x1.5800000000000p+0, 0x1.da46102b1da46p-9), + (double2)(0x1.5800000000000p+0, 0x1.5805601580560p-14), + (double2)(0x1.5700000000000p+0, 0x1.ed3c506b39a23p-12), + (double2)(0x1.5600000000000p+0, 0x1.cbdd3e2970f60p-11), + (double2)(0x1.5500000000000p+0, 0x1.5555555555555p-10), + (double2)(0x1.5400000000000p+0, 0x1.c979aee0bf805p-10), + (double2)(0x1.5300000000000p+0, 0x1.21291e81fd58ep-9), + (double2)(0x1.5200000000000p+0, 0x1.5fead500a9580p-9), + (double2)(0x1.5100000000000p+0, 0x1.a0fd5c5f02a3ap-9), + (double2)(0x1.5000000000000p+0, 0x1.e45c223898adcp-9), + (double2)(0x1.5000000000000p+0, 0x1.5015015015015p-12), + (double2)(0x1.4f00000000000p+0, 0x1.c7b16ea64d422p-11), + (double2)(0x1.4e00000000000p+0, 0x1.7829cbc14e5e1p-10), + (double2)(0x1.4d00000000000p+0, 0x1.0877db8589720p-9), + (double2)(0x1.4c00000000000p+0, 0x1.5710e4b5edceap-9), + (double2)(0x1.4b00000000000p+0, 0x1.a7dbb4d1fc1c8p-9), + (double2)(0x1.4a00000000000p+0, 0x1.fad40a57eb503p-9), + (double2)(0x1.4a00000000000p+0, 0x1.3fd6bb00a5140p-11), + (double2)(0x1.4900000000000p+0, 0x1.4e78ecb419ba9p-10), + (double2)(0x1.4800000000000p+0, 0x1.00a44029100a4p-9), + (double2)(0x1.4700000000000p+0, 0x1.5c28f5c28f5c3p-9), + (double2)(0x1.4600000000000p+0, 0x1.b9c68b2c0cc4ap-9), + (double2)(0x1.4600000000000p+0, 0x1.978feb9f34381p-13), + (double2)(0x1.4500000000000p+0, 0x1.ecf163bb6500ap-11), + (double2)(0x1.4400000000000p+0, 0x1.be1958b67ebb9p-10), + (double2)(0x1.4300000000000p+0, 0x1.44e6157dc9a3bp-9), + (double2)(0x1.4200000000000p+0, 0x1.acc4baa3f0ddfp-9), + (double2)(0x1.4200000000000p+0, 
0x1.6a4cbcb2a247bp-13), + (double2)(0x1.4100000000000p+0, 0x1.0505050505050p-10), + (double2)(0x1.4000000000000p+0, 0x1.e0b4439959819p-10), + (double2)(0x1.3f00000000000p+0, 0x1.6027f6027f602p-9), + (double2)(0x1.3e00000000000p+0, 0x1.d1e854b5e0db4p-9), + (double2)(0x1.3e00000000000p+0, 0x1.165e7254813e2p-11), + (double2)(0x1.3d00000000000p+0, 0x1.76646a9d716efp-10), + (double2)(0x1.3c00000000000p+0, 0x1.32b48f757ce88p-9), + (double2)(0x1.3b00000000000p+0, 0x1.ac1b24652a906p-9), + (double2)(0x1.3b00000000000p+0, 0x1.3b13b13b13b14p-12), + (double2)(0x1.3a00000000000p+0, 0x1.490e1eb208984p-10), + (double2)(0x1.3900000000000p+0, 0x1.2385830fec66ep-9), + (double2)(0x1.3800000000000p+0, 0x1.a45a6cc111b7ep-9), + (double2)(0x1.3800000000000p+0, 0x1.3813813813814p-12), + (double2)(0x1.3700000000000p+0, 0x1.56f472517b708p-10), + (double2)(0x1.3600000000000p+0, 0x1.31be7bc0e8f2ap-9), + (double2)(0x1.3500000000000p+0, 0x1.b9cbf3e55f044p-9), + (double2)(0x1.3500000000000p+0, 0x1.0e7d95bc609a9p-11), + (double2)(0x1.3400000000000p+0, 0x1.9e6b3804d19e7p-10), + (double2)(0x1.3300000000000p+0, 0x1.5c8b6af7963c2p-9), + (double2)(0x1.3200000000000p+0, 0x1.eb9dad43bf402p-9), + (double2)(0x1.3200000000000p+0, 0x1.f1a515885fb37p-11), + (double2)(0x1.3100000000000p+0, 0x1.0eeb1d3d76c02p-9), + (double2)(0x1.3000000000000p+0, 0x1.a320261a32026p-9), + (double2)(0x1.3000000000000p+0, 0x1.c82ac40260390p-12), + (double2)(0x1.2f00000000000p+0, 0x1.a12f684bda12fp-10), + (double2)(0x1.2e00000000000p+0, 0x1.69d43fda2962cp-9), + (double2)(0x1.2e00000000000p+0, 0x1.2e025c04b8097p-15), + (double2)(0x1.2d00000000000p+0, 0x1.42804b542804bp-10), + (double2)(0x1.2c00000000000p+0, 0x1.3f69b02593f6ap-9), + (double2)(0x1.2b00000000000p+0, 0x1.df31cb46e21fap-9), + (double2)(0x1.2b00000000000p+0, 0x1.012b404ad012bp-10), + (double2)(0x1.2a00000000000p+0, 0x1.23925e7820a7fp-9), + (double2)(0x1.2900000000000p+0, 0x1.c8253c8253c82p-9), + (double2)(0x1.2900000000000p+0, 0x1.b92ddc02526e5p-11), + 
(double2)(0x1.2800000000000p+0, 0x1.1602511602511p-9), + (double2)(0x1.2700000000000p+0, 0x1.bf471439c9adfp-9), + (double2)(0x1.2700000000000p+0, 0x1.a85c40939a85cp-11), + (double2)(0x1.2600000000000p+0, 0x1.166f9ac024d16p-9), + (double2)(0x1.2500000000000p+0, 0x1.c44e10125e227p-9), + (double2)(0x1.2500000000000p+0, 0x1.cebf48bbd90e5p-11), + (double2)(0x1.2400000000000p+0, 0x1.2492492492492p-9), + (double2)(0x1.2300000000000p+0, 0x1.d6f2e2ec0b673p-9), + (double2)(0x1.2300000000000p+0, 0x1.159e26af37c05p-10), + (double2)(0x1.2200000000000p+0, 0x1.4024540245402p-9), + (double2)(0x1.2100000000000p+0, 0x1.f6f0243f6f024p-9), + (double2)(0x1.2100000000000p+0, 0x1.5e60121579805p-10), + (double2)(0x1.2000000000000p+0, 0x1.68e18cf81b10fp-9), + (double2)(0x1.2000000000000p+0, 0x1.2012012012012p-12), + (double2)(0x1.1f00000000000p+0, 0x1.c11f7047dc11fp-10), + (double2)(0x1.1e00000000000p+0, 0x1.9e878ff70985ep-9), + (double2)(0x1.1e00000000000p+0, 0x1.779d9fdc3a219p-11), + (double2)(0x1.1d00000000000p+0, 0x1.1eace5c957907p-9), + (double2)(0x1.1c00000000000p+0, 0x1.e0d5b450239e1p-9), + (double2)(0x1.1c00000000000p+0, 0x1.48bf073816367p-10), + (double2)(0x1.1b00000000000p+0, 0x1.694808dda5202p-9), + (double2)(0x1.1b00000000000p+0, 0x1.7c67f2bae2b21p-12), + (double2)(0x1.1a00000000000p+0, 0x1.ee58469ee5847p-10), + (double2)(0x1.1900000000000p+0, 0x1.c0233c0233c02p-9), + (double2)(0x1.1900000000000p+0, 0x1.14e02328a7012p-10), + (double2)(0x1.1800000000000p+0, 0x1.561072057b573p-9), + (double2)(0x1.1800000000000p+0, 0x1.1811811811812p-12), + (double2)(0x1.1700000000000p+0, 0x1.e28646f5a1060p-10), + (double2)(0x1.1600000000000p+0, 0x1.c0d1284e6f1d7p-9), + (double2)(0x1.1600000000000p+0, 0x1.23543f0c80459p-10), + (double2)(0x1.1500000000000p+0, 0x1.63cbeea4e1a09p-9), + (double2)(0x1.1500000000000p+0, 0x1.b9a3fdd5c8cb8p-12), + (double2)(0x1.1400000000000p+0, 0x1.0be1c159a76d2p-9), + (double2)(0x1.1300000000000p+0, 0x1.e1d1a688e4838p-9), + (double2)(0x1.1300000000000p+0, 
0x1.72044d72044d7p-10), + (double2)(0x1.1200000000000p+0, 0x1.91713db81577bp-9), + (double2)(0x1.1200000000000p+0, 0x1.ac73ae9819b50p-11), + (double2)(0x1.1100000000000p+0, 0x1.460334e904cf6p-9), + (double2)(0x1.1100000000000p+0, 0x1.1111111111111p-12), + (double2)(0x1.1000000000000p+0, 0x1.feef80441fef0p-10), + (double2)(0x1.0f00000000000p+0, 0x1.de021fde021fep-9), + (double2)(0x1.0f00000000000p+0, 0x1.7b7eacc9686a0p-10), + (double2)(0x1.0e00000000000p+0, 0x1.9ead7cd391fbcp-9), + (double2)(0x1.0e00000000000p+0, 0x1.0195609804390p-10), + (double2)(0x1.0d00000000000p+0, 0x1.641511e8d2b32p-9), + (double2)(0x1.0d00000000000p+0, 0x1.222b1acf1ce96p-11), + (double2)(0x1.0c00000000000p+0, 0x1.2e29f79b47582p-9), + (double2)(0x1.0c00000000000p+0, 0x1.4f0d1682e11cdp-13), + (double2)(0x1.0b00000000000p+0, 0x1.f9bb096771e4dp-10), + (double2)(0x1.0a00000000000p+0, 0x1.e5ee45dd96ae2p-9), + (double2)(0x1.0a00000000000p+0, 0x1.a0429a0429a04p-10), + (double2)(0x1.0900000000000p+0, 0x1.bb74d5f06c021p-9), + (double2)(0x1.0900000000000p+0, 0x1.4fce404254fcep-10), + (double2)(0x1.0800000000000p+0, 0x1.95766eacbc402p-9), + (double2)(0x1.0800000000000p+0, 0x1.0842108421084p-10), + (double2)(0x1.0700000000000p+0, 0x1.73e5371d5c338p-9), + (double2)(0x1.0700000000000p+0, 0x1.930523fbe3368p-11), + (double2)(0x1.0600000000000p+0, 0x1.56b38f225f6c4p-9), + (double2)(0x1.0600000000000p+0, 0x1.26e978d4fdf3bp-11), + (double2)(0x1.0500000000000p+0, 0x1.3dd40e4eb0cc6p-9), + (double2)(0x1.0500000000000p+0, 0x1.97f7d73404146p-12), + (double2)(0x1.0400000000000p+0, 0x1.293982cc98af1p-9), + (double2)(0x1.0400000000000p+0, 0x1.0410410410410p-12), + (double2)(0x1.0300000000000p+0, 0x1.18d6f048ff7e4p-9), + (double2)(0x1.0300000000000p+0, 0x1.236a3ebc349dep-13), + (double2)(0x1.0200000000000p+0, 0x1.0c9f8ee53d18cp-9), + (double2)(0x1.0200000000000p+0, 0x1.0204081020408p-14), + (double2)(0x1.0100000000000p+0, 0x1.0486ca2f46ea6p-9), + (double2)(0x1.0100000000000p+0, 0x1.0101010101010p-16), + 
(double2)(0x1.0000000000000p+0, 0x1.0080402010080p-9), + (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), +) +
diff --git a/amd-builtins/math64/pownD.cl b/amd-builtins/math64/pownD.cl new file mode 100644 index 0000000..83c2762 --- /dev/null +++ b/amd-builtins/math64/pownD.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_POWN +#include "powD_base.h" +
diff --git a/amd-builtins/math64/powrD.cl b/amd-builtins/math64/powrD.cl new file mode 100644 index 0000000..a02e929 --- /dev/null +++ b/amd-builtins/math64/powrD.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_POWR +#include "powD_base.h" +
diff --git a/amd-builtins/math64/remainderD.cl b/amd-builtins/math64/remainderD.cl new file mode 100644 index 0000000..a15873c --- /dev/null +++ b/amd-builtins/math64/remainderD.cl
@@ -0,0 +1,27 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +#define COMPILING_REMAINDER +#include "remainderD.h" +
diff --git a/amd-builtins/math64/remainderD.h b/amd-builtins/math64/remainderD.h new file mode 100644 index 0000000..95cea6f --- /dev/null +++ b/amd-builtins/math64/remainderD.h
@@ -0,0 +1,203 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +static inline double +my_ldexp(double x, int n) +{ + // XXX Have to go twice here because the hardware can't handle the full range (yet) + int nh = n >> 1; + return ldexp(ldexp(x, nh), n-nh); +} + +// One body implements fmod, remquo and remainder; the entry point is +// selected by the COMPILING_* macro defined in the including .cl file. +// All three share the same core: repeated exact partial remainders of +// |x| by scaled multiples of |y|. +#if defined(COMPILING_FMOD) +__attribute__((overloadable, always_inline, weak)) double +fmod(double x, double y) +#elif defined(COMPILING_REMQUO) +__attribute__((overloadable, always_inline, weak)) double +remquo(double x, double y, int *pquo) +#else +__attribute__((overloadable, always_inline, weak)) double +remainder(double x, double y) +#endif +{ + ulong ux = as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + ulong xsgn = ux ^ ax; + double dx = as_double(ax); + int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); + // Effective exponent: for subnormals (biased exponent < 1) recover it from the position of the leading mantissa bit. + int xexp1 = 11 - (int) clz(ax & MANTBITS_DP64); + xexp1 = xexp < 1 ? 
xexp1 : xexp; + + ulong uy = as_ulong(y); + ulong ay = uy & ~SIGNBIT_DP64; + double dy = as_double(ay); + int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); + int yexp1 = 11 - (int) clz(ay & MANTBITS_DP64); + yexp1 = yexp < 1 ? yexp1 : yexp; + +#if !defined COMPILING_FMOD + // qsgn is the sign of the true quotient x/y, needed for the remainder adjustment and for remquo's returned quotient bits. + int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1; +#endif + + // First assume |x| > |y| + + // Set ntimes to the number of times we need to do a + // partial remainder. If the exponent of x is an exact multiple + // of 53 larger than the exponent of y, and the mantissa of x is + // less than the mantissa of y, ntimes will be one too large + // but it doesn't matter - it just means that we'll go round + // the loop below one extra time. + int ntimes = max(0, (xexp1 - yexp1) / 53); + double w = my_ldexp(dy, ntimes * 53); + w = ntimes == 0 ? dy : w; + double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; + + // Each time round the loop we compute a partial remainder. + // This is done by subtracting a large multiple of w + // from x each time, where w is a scaled up version of y. + // The subtraction must be performed exactly in quad + // precision, though the result at each stage can + // fit exactly in a double precision number. + int i; + double t, v, p, pp; + + for (i = 0; i < ntimes; i++) { + // Compute integral multiplier + t = trunc(dx / w); + + // Compute w * t in quad precision + p = w * t; + pp = fma(w, t, -p); + + // Subtract w * t from dx + v = dx - p; + dx = v + (((dx - v) - p) - pp); + + // If t was one too large, dx will be negative. Add back one w. + dx += dx < 0.0 ? w : 0.0; + + // Scale w down by 2^(-53) for the next iteration + w *= scale; + } + + // One more time + // Variable todd says whether the integer t is odd or not + t = floor(dx / w); + long lt = (long)t; + int todd = lt & 1; + + p = w * t; + pp = fma(w, t, -p); + v = dx - p; + dx = v + (((dx - v) - p) - pp); + i = dx < 0.0; + todd ^= i; + dx += i ? 
 w : 0.0; + +#if defined(COMPILING_REMQUO) + lt -= i; +#endif + + // At this point, dx lies in the range [0,dy) + +#if !defined(COMPILING_FMOD) + // For the fmod function, we're done apart from setting the correct sign. + // + // For the remainder function, we need to adjust dx + // so that it lies in the range (-y/2, y/2] by carefully + // subtracting w (== dy == y) if necessary. The rigmarole + // with todd is to get the correct sign of the result + // when x/y lies exactly half way between two integers, + // when we need to choose the even integer. + + int al = (2.0*dx > w) | (todd & (2.0*dx == w)); + double dxl = dx - (al ? w : 0.0); + + int ag = (dx > 0.5*w) | (todd & (dx == 0.5*w)); + double dxg = dx - (ag ? w : 0.0); + + // NOTE(review): presumably the 2.0*dx form could overflow when dy is near the top of the range, hence the dx > 0.5*w variant there - confirm. + dx = dy < 0x1.0p+1022 ? dxl : dxg; +# if defined COMPILING_REMQUO + lt += dy < 0x1.0p+1022 ? al : ag; + int quo = ((int)lt & 0x7f) * qsgn; +# endif +#endif + + // Fold the sign of x back into the magnitude-domain result. + double ret = as_double(xsgn ^ as_ulong(dx)); + dx = as_double(ax); + + // Now handle |x| == |y| + int c = dx == dy; + t = as_double(xsgn); +#if defined COMPILING_REMQUO + quo = c ? qsgn : quo; +#endif + ret = c ? t : ret; + + // Next, handle |x| < |y| + c = dx < dy; +#if defined COMPILING_REMQUO + quo = c ? 0 : quo; +#endif + ret = c ? x : ret; + +#if !defined COMPILING_FMOD + c &= (yexp < 1023 & 2.0*dx > dy) | (dx > 0.5*dy); +# if defined COMPILING_REMQUO + quo = c ? qsgn : quo; +# endif + // we could use a conversion here instead since qsgn = +-1 + p = qsgn == 1 ? -1.0 : 1.0; + t = fma(y, p, x); + ret = c ? t : ret; +#endif + + // We don't need anything special for |x| == 0 + + // |y| is 0 + c = dy == 0.0; +#if defined COMPILING_REMQUO + quo = c ? 0 : quo; +#endif + ret = c ? as_double(QNANBITPATT_DP64) : ret; + + // y is +-Inf, NaN + c = yexp > BIASEDEMAX_DP64; +#if defined COMPILING_REMQUO + quo = c ? 0 : quo; +#endif + t = y == y ? x : y; + ret = c ? t : ret; + + // x is +-Inf, NaN + c = xexp > BIASEDEMAX_DP64; +#if defined COMPILING_REMQUO + quo = c ? 0 : quo; +#endif + ret = c ? 
 as_double(QNANBITPATT_DP64) : ret; + +#if defined COMPILING_REMQUO + *pquo = quo; +#endif + return ret; +} +
diff --git a/amd-builtins/math64/remainderD_piby2.h b/amd-builtins/math64/remainderD_piby2.h new file mode 100644 index 0000000..ccf7937 --- /dev/null +++ b/amd-builtins/math64/remainderD_piby2.h
@@ -0,0 +1,215 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// Reduction for medium sized arguments +// Computes r + rr = x - n*(pi/2) as a head/tail pair and regn = n mod 4, +// where n is the nearest integer to x*(2/pi). +static inline void +remainder_piby2_medium(double x, double *r, double *rr, int *regn) +{ + // How many pi/2 is x a multiple of? 
 + const double two_by_pi = 0x1.45f306dc9c883p-1; + double dnpi2 = trunc(fma(x, two_by_pi, 0.5)); + + const double piby2_h = -7074237752028440.0 / 0x1.0p+52; + const double piby2_m = -2483878800010755.0 / 0x1.0p+105; + const double piby2_t = -3956492004828932.0 / 0x1.0p+158; + + // Compute product of npi2 with 159 bits of pi/2 + double p_hh = piby2_h * dnpi2; + double p_ht = fma(piby2_h, dnpi2, -p_hh); + double p_mh = piby2_m * dnpi2; + double p_mt = fma(piby2_m, dnpi2, -p_mh); + double p_th = piby2_t * dnpi2; + double p_tt = fma(piby2_t, dnpi2, -p_th); + + // Reduce to 159 bits + double ph = p_hh; + double pm = p_ht + p_mh; + double t = p_mh - (pm - p_ht); + double pt = p_th + t + p_mt + p_tt; + t = ph + pm; pm = pm - (t - ph); ph = t; + t = pm + pt; pt = pt - (t - pm); pm = t; + + // Subtract from x + t = x + ph; + double qh = t + pm; + double qt = pm - (qh - t) + pt; + + *r = qh; + *rr = qt; + *regn = (int)(long)dnpi2 & 0x3; +} + +// Given positive argument x, reduce it to the range [-pi/4,pi/4] using +// extra precision, and return the result in r, rr. +// Return value "regn" tells how many lots of pi/2 were subtracted +// from x to put it in the range [-pi/4,pi/4], mod 4. + +// For bytealign +#pragma OPENCL EXTENSION cl_amd_media_ops : enable + +static inline void +remainder_piby2_large(double x, double *r, double *rr, int *regn) +{ + USE_TABLE(uchar, pibits, PIBITS); + + long ux = as_long(x); + int e = (int)(ux >> 52) - 1023; + int i = max(23, (e >> 3) + 17); + int j = 150 - i; + int j16 = j & ~0xf; + + // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary + uint4 q0 = *(__constant uint4 *)(pibits + j16); + uint4 q1 = *(__constant uint4 *)(pibits + j16 + 16); + uint4 q2 = *(__constant uint4 *)(pibits + j16 + 32); + + int k = (j >> 2) & 0x3; + int4 c = (int4)k == (int4)(0, 1, 2, 3); + + // Select seven consecutive 32-bit words of the table starting at word k of q0. + uint u0, u1, u2, u3, u4, u5, u6; + + u0 = c.s1 ? q0.s1 : q0.s0; + u0 = c.s2 ? q0.s2 : u0; + u0 = c.s3 ? q0.s3 : u0; + + u1 = c.s1 ? 
 q0.s2 : q0.s1; + u1 = c.s2 ? q0.s3 : u1; + u1 = c.s3 ? q1.s0 : u1; + + u2 = c.s1 ? q0.s3 : q0.s2; + u2 = c.s2 ? q1.s0 : u2; + u2 = c.s3 ? q1.s1 : u2; + + u3 = c.s1 ? q1.s0 : q0.s3; + u3 = c.s2 ? q1.s1 : u3; + u3 = c.s3 ? q1.s2 : u3; + + u4 = c.s1 ? q1.s1 : q1.s0; + u4 = c.s2 ? q1.s2 : u4; + u4 = c.s3 ? q1.s3 : u4; + + u5 = c.s1 ? q1.s2 : q1.s1; + u5 = c.s2 ? q1.s3 : u5; + u5 = c.s3 ? q2.s0 : u5; + + u6 = c.s1 ? q1.s3 : q1.s2; + u6 = c.s2 ? q2.s0 : u6; + u6 = c.s3 ? q2.s1 : u6; + + // amd_bytealign shifts each adjacent word pair down by the remaining (j & 3) byte offset, completing the byte-exact alignment of the 2/pi bit stream. + uint v0 = amd_bytealign(u1, u0, j); + uint v1 = amd_bytealign(u2, u1, j); + uint v2 = amd_bytealign(u3, u2, j); + uint v3 = amd_bytealign(u4, u3, j); + uint v4 = amd_bytealign(u5, u4, j); + uint v5 = amd_bytealign(u6, u5, j); + + // Place those 192 bits in 4 48-bit doubles along with correct exponent + // If i > 1018 we would get subnormals so we scale p up and x down to get the same product + i = 2 + 8*i; + x *= i > 1018 ? 0x1.0p-136 : 1.0; + i -= i > 1018 ? 136 : 0; + + uint ua = (uint)(1023 + 52 - i) << 20; + double a = as_double((uint2)(0, ua)); + double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a; + + // Exact multiply + double f0h = p0 * x; + double f0l = fma(p0, x, -f0h); + double f1h = p1 * x; + double f1l = fma(p1, x, -f1h); + double f2h = p2 * x; + double f2l = fma(p2, x, -f2h); + double f3h = p3 * x; + double f3l = fma(p3, x, -f3h); + + // Accumulate product into 4 doubles + double s, t; + + double f3 = f3h + f2h; + t = f2h - (f3 - f3h); + s = f3l + t; + t = t - (s - f3l); + + double f2 = s + f1h; + t = f1h - (f2 - s) + t; + s = f2l + t; + t = t - (s - f2l); + + double f1 = s + f0h; + t = 
 f0h - (f1 - s) + t; + s = f1l + t; + + double f0 = s + f0l; + + // Strip off unwanted large integer bits + f3 = 0x1.0p+10 * __amdil_fraction_f64(f3 * 0x1.0p-10); + f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; + +#undef EXTRA_ACCURACY +#if defined EXTRA_ACCURACY + // Shift out large integer bits. This adds about 20 bits to the accuracy of "rr" + s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t; + s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t; + s = f1 + f0; t = f0 - (s - f1); f1 = s; f0 = t; +#endif + + // Compute least significant integer bits + t = f3 + f2; + double di = t - __amdil_fraction_f64(t); + i = (float)di; + + // Shift out remaining integer part + f3 -= di; + s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t; + s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t; + f1 += f0; + + // Subtract 1 if fraction is >= 0.5, and update regn + int g = f3 >= 0.5; + i += g; + f3 -= (float)g; + + // Shift up bits + s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1; + + // Multiply precise fraction by pi/2 to get radians + const double p2h = 7074237752028440.0 / 0x1.0p+52; + const double p2t = 4967757600021510.0 / 0x1.0p+106; + + double rhi = f3 * p2h; + double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi))); + + *r = rhi + rlo; + *rr = rlo - (*r - rhi); + *regn = i & 0x3; +} +
diff --git a/amd-builtins/math64/remquoD.cl b/amd-builtins/math64/remquoD.cl new file mode 100644 index 0000000..f296549 --- /dev/null +++ b/amd-builtins/math64/remquoD.cl
@@ -0,0 +1,46 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +#define COMPILING_REMQUO +#include "remainderD.h" + +// Before OpenCL 2.0 there is no generic address space, so explicit +// __global and __local overloads are required; each forwards to the +// private-pointer overload that remainderD.h defines above. +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline)) double +remquo(double x, double y, __global int *quo) +{ + int q; + double r = remquo(x, y, &q); + *quo = q; + return r; +} + +__attribute__((overloadable, always_inline)) double +remquo(double x, double y, __local int *quo) +{ + int q; + double r = remquo(x, y, &q); + *quo = q; + return r; +} +#endif
diff --git a/amd-builtins/math64/rintD.cl b/amd-builtins/math64/rintD.cl new file mode 100644 index 0000000..00e3ef6 --- /dev/null +++ b/amd-builtins/math64/rintD.cl
@@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +// rint: round to the nearest integer, mapped straight onto the AMDIL +// round-nearest instruction (assumed to round ties to even, per rint's +// contract - confirm against the AMDIL instruction definition). +__attribute__ ((overloadable, always_inline)) double +rint(double x) +{ + return __amdil_round_nearest_f64(x); +} +
diff --git a/amd-builtins/math64/rootnD.cl b/amd-builtins/math64/rootnD.cl new file mode 100644 index 0000000..06840dc --- /dev/null +++ b/amd-builtins/math64/rootnD.cl
@@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define COMPILING_ROOTN +#include "powD_base.h" +
diff --git a/amd-builtins/math64/roundD.cl b/amd-builtins/math64/roundD.cl new file mode 100644 index 0000000..2aa0827 --- /dev/null +++ b/amd-builtins/math64/roundD.cl
@@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +// round: round half away from zero, done entirely with integer bit +// manipulation of the IEEE-754 encoding, so no rounding-mode-sensitive +// floating-point arithmetic is involved. +__attribute__ ((overloadable, always_inline)) double +round(double x) +{ + long l = as_long(x); + // Unbiased exponent. + int e = ((int)(l >> 52) & 0x7ff) - 1023; + // For |x| < 1 the answer is +-0 or +-1: keep the sign bit and OR in 1.0 exactly when e == -1 (i.e. 0.5 <= |x| < 1). + long s = (l & 0x8000000000000000L) | (e == -1 ? 0x3ff0000000000000L : 0L); + // m masks the fraction bits below the binary point; d is half of the lowest integer-valued bit, added to round half away from zero. NOTE(review): e may lie outside [0,63] here; those lanes are overridden below, but the shifts rely on OpenCL's defined (masked) shift-count semantics - confirm. + long m = 0x000fffffffffffffL >> e; + long d = 0x0008000000000000L >> e; + long k = l + (l & m ? d : 0); + k &= ~m; + // Patch in the |x| < 1 result and pass through values that are already integral (e > 51 also covers +-inf and NaN). + k = e < 0 ? s : k; + k = e > 51 ? l : k; + return as_double(k); +} +
diff --git a/amd-builtins/math64/rsqrtD.cl b/amd-builtins/math64/rsqrtD.cl new file mode 100644 index 0000000..3f6f346 --- /dev/null +++ b/amd-builtins/math64/rsqrtD.cl
@@ -0,0 +1,33 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +// rsqrt: 1/sqrt(x). The hardware reciprocal-square-root seed is refined +// with two Newton-Raphson iterations y' = 0.5*y*(3 - x*y*y). +__attribute__((overloadable, always_inline)) double +rsqrt(double x) +{ + double y0 = __amdil_rsq_f64(x); + double y1 = 0.5 * y0 * fma(-x*y0, y0, 3.0); + double y2 = 0.5 * y1 * fma(-x*y1, y1, 3.0); + // Only use the refined value when the seed is a positive finite number; for x = 0, inf, NaN or negative x the seed (inf, 0 or NaN) is already the required result and the iterations would corrupt it. + return y0 > 0.0 & y0 <= 0x1.fffffffffffffp+1023 ? y2 : y0; +} +
diff --git a/amd-builtins/math64/sinD.cl b/amd-builtins/math64/sinD.cl new file mode 100644 index 0000000..e4705de --- /dev/null +++ b/amd-builtins/math64/sinD.cl
@@ -0,0 +1,47 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" +#include "sincosD_piby4.h" +#include "remainderD_piby2.h" + +// sin(x): reduce |x| modulo pi/2 (table-driven large-argument reduction +// above 2^47), evaluate sin/cos on [-pi/4,pi/4], then fix quadrant and +// sign by toggling the sign bit of the result's high word. +__attribute__((overloadable, always_inline, pure, weak)) double +sin(double x) +{ + double y = fabs(x); + + double r, rr; + int regn; + + if (y < 0x1.0p+47) + remainder_piby2_medium(y, &r, &rr, ®n); + else + remainder_piby2_large(y, &r, &rr, ®n); + + double2 sc = sincos_piby4(r, rr); + + // Odd quadrants take the cosine of the reduced argument; quadrants 2 and 3, and negative x, each flip the sign bit. + int2 s = as_int2(regn & 1 ? sc.hi : sc.lo); + s.hi ^= ((regn > 1) << 31) ^ ((x < 0.0) << 31); + + return isinf(x) | isnan(x) ? as_double(QNANBITPATT_DP64) : as_double(s); +} +
diff --git a/amd-builtins/math64/sincosD.cl b/amd-builtins/math64/sincosD.cl new file mode 100644 index 0000000..daf4102 --- /dev/null +++ b/amd-builtins/math64/sincosD.cl
@@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" +#include "sincosD_piby4.h" +#include "remainderD_piby2.h" + +// sincos: compute sin(x) (returned) and cos(x) (stored through cp) +// sharing a single argument reduction and polynomial evaluation. +__attribute__((overloadable, always_inline)) double +sincos(double x, double * cp) +{ + double y = fabs(x); + + double r, rr; + int regn; + + if (y < 0x1.0p+47) + remainder_piby2_medium(y, &r, &rr, ®n); + else + remainder_piby2_large(y, &r, &rr, ®n); + + double2 sc = sincos_piby4(r, rr); + + // Quadrant fix-up: odd quadrants swap sin/cos, quadrants 2 and 3 negate, and the sine additionally inherits the sign of x. + int flip = (regn > 1) << 31; + int2 s = as_int2(regn & 1 ? sc.hi : sc.lo); + s.hi ^= flip ^ ((x < 0.0) << 31); + sc.lo = -sc.lo; + int2 c = as_int2(regn & 1 ? sc.lo : sc.hi); + c.hi ^= flip; + + // inf and NaN inputs produce NaN for both results. + int xgeinf = isnan(x) | isinf(x); + s = xgeinf ? as_int2(QNANBITPATT_DP64) : s; + c = xgeinf ? 
 as_int2(QNANBITPATT_DP64) : c; + + *cp = as_double(c); + return as_double(s); +} + +// Before OpenCL 2.0 there is no generic address space, so forwarding +// overloads are needed for __global and __local result pointers. +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline)) double +sincos(double x, __global double * cp) +{ + double c; + double s = sincos(x, &c); + *cp = c; + return s; +} + +__attribute__((overloadable, always_inline)) double +sincos(double x, __local double * cp) +{ + double c; + double s = sincos(x, &c); + *cp = c; + return s; +} +#endif
diff --git a/amd-builtins/math64/sincosD_piby4.h b/amd-builtins/math64/sincosD_piby4.h new file mode 100644 index 0000000..384a5f7 --- /dev/null +++ b/amd-builtins/math64/sincosD_piby4.h
@@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// sincos_piby4: for a reduced argument x (with tail word xx), return +// the sine in .lo and the cosine in .hi of the result vector. +static inline double2 +sincos_piby4(double x, double xx) +{ + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we add a correction + // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) + // is an approximation to cos(x)*sin(xx) valid because + // xx is tiny relative to x. + + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... 
 + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we subtract a correction + // term g(x,xx) = x*xx to the result, where g(x,xx) + // is an approximation to sin(x)*sin(xx) valid because + // xx is tiny relative to x. + + const double sc1 = -0.166666666666666646259241729; + const double sc2 = 0.833333333333095043065222816e-2; + const double sc3 = -0.19841269836761125688538679e-3; + const double sc4 = 0.275573161037288022676895908448e-5; + const double sc5 = -0.25051132068021699772257377197e-7; + const double sc6 = 0.159181443044859136852668200e-9; + + const double cc1 = 0.41666666666666665390037e-1; + const double cc2 = -0.13888888888887398280412e-2; + const double cc3 = 0.248015872987670414957399e-4; + const double cc4 = -0.275573172723441909470836e-6; + const double cc5 = 0.208761463822329611076335e-8; + const double cc6 = -0.113826398067944859590880e-10; + + double x2 = x * x; + double x3 = x2 * x; + double r = 0.5 * x2; + double t = 1.0 - r; + + double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); + + double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1), + x2*x2, fma(x, xx, (1.0 - t) - r)); + + // .lo = sin of the (head, tail) argument, .hi = cos. + double2 ret; + ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx)); + ret.hi = cp; + + return ret; +} +
diff --git a/amd-builtins/math64/sinhD.cl b/amd-builtins/math64/sinhD.cl new file mode 100644 index 0000000..11098fc --- /dev/null +++ b/amd-builtins/math64/sinhD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision hyperbolic sine, table-driven with polynomial correction.
__attribute__((overloadable)) double
sinh(double x)
{
    USE_TABLE(double2, sinh_tbl, SINH_TBL);
    USE_TABLE(double2, cosh_tbl, COSH_TBL);

    // After dealing with special cases the computation is split into
    // regions as follows:
    //
    // abs(x) >= max_sinh_arg:
    // sinh(x) = sign(x)*Inf
    //
    // abs(x) >= small_threshold:
    // sinh(x) = sign(x)*exp(abs(x))/2 computed using the
    // splitexp and scaleDouble functions as for exp_amd().
    //
    // abs(x) < small_threshold:
    // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
    // sinh(x) is then sign(x)*z.

    const double max_sinh_arg = 7.10475860073943977113e+02; // 0x408633ce8fb9f87e

    // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27)
    const double small_threshold = 0x1.2b708872320e2p+4;

    double y = fabs(x);

    // In this range we find the integer part y0 of y
    // and the increment dy = y - y0. We then compute
    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
    // where sinh(y0) and cosh(y0) are obtained from tables

    // Tables are indexed 0..36, so clamp the integer part to 36.
    int ind = min((int)y, 36);
    double dy = y - ind;
    double dy2 = dy * dy;

    // Taylor polynomial for sinh(dy) - dy (odd series; coefficients 1/3!, 1/5!, ...).
    double sdy = dy * dy2 *
	         fma(dy2,
		     fma(dy2,
			 fma(dy2,
			     fma(dy2,
				 fma(dy2,
				     fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9),
				     0.250521176994133472333666e-7),
				 0.275573191913636406057211e-5),
			     0.198412698413242405162014e-3),
			 0.833333333333329931873097e-2),
		     0.166666666666666667013899e0);

    // Taylor polynomial for cosh(dy) - 1 (even series; coefficients 1/2!, 1/4!, ...).
    double cdy = dy2 * fma(dy2,
	                   fma(dy2,
			       fma(dy2,
				   fma(dy2,
				       fma(dy2,
					   fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8),
					   0.275573350756016588011357e-6),
				       0.248015872460622433115785e-4),
				   0.138888888889814854814536e-2),
			       0.416666666666660876512776e-1),
			   0.500000000000000005911074e0);

    // At this point sinh(dy) is approximated by dy + sdy.
    // Shift some significant bits from dy to sdy.
    // sdy1 keeps the top 26 bits of dy; sdy2 collects the rest plus sdy.
    double sdy1 = as_double(as_ulong(dy) & 0xfffffffff8000000UL);
    double sdy2 = sdy + (dy - sdy1);

    // Table entries are (lead, tail) pairs with 26-bit leads, so the
    // lead*lead products below are exact.
    double2 tv = cosh_tbl[ind];
    double cl = tv.s0;
    double ct = tv.s1;
    tv = sinh_tbl[ind];
    double sl = tv.s0;
    double st = tv.s1;

    // Accumulate sinh(y0)cosh(dy) + cosh(y0)sinh(dy) from smallest to
    // largest terms to minimize rounding error.
    double z = fma(cl, sdy1, fma(sl, cdy, fma(cl, sdy2, fma(ct, sdy1, fma(st, cdy, ct*sdy2)) + st))) + sl;

    // Other cases
    // Bitwise | on the int-valued predicates (branchless select).
    // For tiny |x|, sinh(x) ~= x; NaN and Inf inputs also pass y through.
    z = y < 0x1.0p-28 | isnan(x) | isinf(x) ? y : z;

    // Large |x|: sinh(y) ~= exp(y)/2 = exp(y - ln 2); the constant is the
    // leading part of ln 2 and the fma applies its tail correction.
    double t = exp(y - 0x1.62e42fefa3800p-1);
    t = fma(t, -0x1.ef35793c76641p-45, t);
    z = y >= small_threshold ? t : z;
    z = y >= max_sinh_arg ? as_double(PINFBITPATT_DP64) : z;

    // sinh is odd: transfer the sign of x onto the magnitude result.
    return copysign(z, x);
}
diff --git a/amd-builtins/math64/sinhcoshD_table.h b/amd-builtins/math64/sinhcoshD_table.h new file mode 100644 index 0000000..1db0e89 --- /dev/null +++ b/amd-builtins/math64/sinhcoshD_table.h
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

// Lead and tail tabulated values of sinh(i) and cosh(i)
// for i = 0,...,36. The lead part has 26 leading bits.
// Entry i is (lead, tail) with lead + tail ~= sinh(i) (resp. cosh(i));
// the 26-bit lead allows exact lead*lead products in double (see sinhD.cl).
// NOTE(review): each DECLARE_TABLE call ends with a trailing comma before
// the closing paren; this relies on the macro tolerating an empty trailing
// argument — confirm DECLARE_TABLE is variadic.

DECLARE_TABLE(double2, SINH_TBL, 37,
    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
    (double2)(0x1.2cd9fc0000000p+0, 0x1.13ae6096a0092p-26),
    (double2)(0x1.d03cf60000000p+1, 0x1.db70cfb79a640p-26),
    (double2)(0x1.40926e0000000p+3, 0x1.c2526b66dc067p-23),
    (double2)(0x1.b4a3800000000p+4, 0x1.b81b18647f380p-23),
    (double2)(0x1.28d0160000000p+6, 0x1.bc1cdd1e1eb08p-20),
    (double2)(0x1.936d228000000p+7, 0x1.d9f201534fb09p-19),
    (double2)(0x1.1228768000000p+9, 0x1.d1c064a4e9954p-18),
    (double2)(0x1.749ea50000000p+10, 0x1.4eca65d06ea74p-18),
    (double2)(0x1.fa71570000000p+11, 0x1.0c259bcc0ecc5p-15),
    (double2)(0x1.5829dc8000000p+13, 0x1.b5a6647cf9016p-13),
    (double2)(0x1.d3c4488000000p+14, 0x1.9691adefb0870p-15),
    (double2)(0x1.3de1650000000p+16, 0x1.3410fc29cde38p-10),
    (double2)(0x1.b00b590000000p+17, 0x1.6a31a50b6fb3cp-11),
    (double2)(0x1.259ac48000000p+19, 0x1.7defc71805c40p-10),
    (double2)(0x1.8f0cca8000000p+20, 0x1.eb49fd80e0babp-6),
    (double2)(0x1.0f2ebd0000000p+22, 0x1.4fffc7bcd5920p-7),
    (double2)(0x1.7093488000000p+23, 0x1.03a93b6c63435p-3),
    (double2)(0x1.f4f2208000000p+24, 0x1.1940bb255fd1cp-4),
    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e14260b50p-2),
    (double2)(0x1.ceb0888000000p+27, 0x1.b47401fc9f2a2p+0),
    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55634f1p+3),
    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff8194ddcp+2),
    (double2)(0x1.226af30000000p+32, 0x1.d8fee052ba63ap+5),
    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde3f6p+7),
    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557d1ap+8),
    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9dc4p+8),
    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abap+11),
    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6363p+14),
    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15),
    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22),
)

DECLARE_TABLE(double2, COSH_TBL, 37,
    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
    (double2)(0x1.8b07550000000p+0, 0x1.d9f5504c2bd28p-28),
    (double2)(0x1.e18fa08000000p+1, 0x1.7cb66f0a4c9fdp-25),
    (double2)(0x1.422a490000000p+3, 0x1.f58617928e588p-23),
    (double2)(0x1.b4ee858000000p+4, 0x1.bc7d000c38d48p-25),
    (double2)(0x1.28d6fc8000000p+6, 0x1.f7f9d4e329998p-21),
    (double2)(0x1.936e678000000p+7, 0x1.6e6e464885269p-19),
    (double2)(0x1.1228948000000p+9, 0x1.ba3a8b946c154p-19),
    (double2)(0x1.749eaa8000000p+10, 0x1.3f4e76110d5a4p-18),
    (double2)(0x1.fa71580000000p+11, 0x1.17622515a3e2bp-15),
    (double2)(0x1.5829dd0000000p+13, 0x1.4dc4b528af3d0p-17),
    (double2)(0x1.d3c4488000000p+14, 0x1.1156278615e10p-14),
    (double2)(0x1.3de1650000000p+16, 0x1.35ad50ed821f5p-10),
    (double2)(0x1.b00b590000000p+17, 0x1.6b61055f2935cp-11),
    (double2)(0x1.259ac48000000p+19, 0x1.7e2794a601240p-10),
    (double2)(0x1.8f0cca8000000p+20, 0x1.eb4b45f6aadd3p-6),
    (double2)(0x1.0f2ebd0000000p+22, 0x1.5000b967b3698p-7),
    (double2)(0x1.7093488000000p+23, 0x1.03a940fadc092p-3),
    (double2)(0x1.f4f2208000000p+24, 0x1.1940bf3bf874cp-4),
    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e1a2a2110p-2),
    (double2)(0x1.ceb0888000000p+27, 0x1.b4740205796d6p+0),
    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55cb85dp+3),
    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff81e18acp+2),
    (double2)(0x1.226af30000000p+32, 0x1.d8fee052bdea4p+5),
    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde926p+7),
    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557e0ep+8),
    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9e1cp+8),
    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abep+11),
    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6364p+14),
    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15),
    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22),
)
diff --git a/amd-builtins/math64/sinpiD.cl b/amd-builtins/math64/sinpiD.cl new file mode 100644 index 0000000..a935a9c --- /dev/null +++ b/amd-builtins/math64/sinpiD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"
#include "sincosD_piby4.h"

// Double-precision sinpi(x) = sin(pi * x).
// Works on the fractional part r of |x|, maps r into [0, 0.25] by symmetry,
// then evaluates sin or cos of a*pi via sincos_piby4; the sign is assembled
// bitwise from the sign of x and the parity of the integer part.
__attribute__((overloadable)) double
sinpi(double x)
{
    const double pi = 3.1415926535897932384626433832795;

    long ix = as_long(x);
    long xsgn = ix & 0x8000000000000000L;   // sign bit of x
    ix ^= xsgn;                             // |x| bit pattern
    double ax = as_double(ix);
    long iax = (long)ax;                    // integer part of |x|
    double r = ax - (double)iax;            // fractional part, in [0, 1)
    // Sign flip when the integer part is odd: sin(pi*(n+r)) = (-1)^n sin(pi*r).
    long xodd = xsgn ^ (iax & 0x1L ? 0x8000000000000000L : 0L);

    // Initialize with return for +-Inf and NaN
    long ir = 0x7ff8000000000000L;

    // 2^52 <= |x| < Inf: |x| is an integer, so the result is a signed zero.
    // NOTE(review): the original comment said 2^23, the float threshold;
    // the selection below actually uses 0x1.0p+52.
    ir = ix < 0x7ff0000000000000 ? xsgn : ir;

    // |x| < 2^52: result depends on which 0.25-wide interval r falls in.
    // (NOTE(review): original comment gave a 0x1.0p-7 lower bound, copied
    // from the float version; no such cutoff exists in this code.)

    // r < 1.0: default quarter, a = 1 - r, use sin (e = 0)
    double a = 1.0 - r;
    int e = 0;

    // r <= 0.75: a = r - 0.5, use cos (e = 1)
    int c = r <= 0.75;
    double t = r - 0.5;
    a = c ? t : a;
    e = c ? 1 : e;

    // r < 0.5: a = 0.5 - r, still cos
    c = r < 0.5;
    t = 0.5 - r;
    a = c ? t : a;

    // r <= 0.25: a = r, back to sin (e = 0)
    c = r <= 0.25;
    a = c ? r : a;
    e = c ? 0 : e;

    double api = a * pi;
    // sincos_piby4 returns sin in .lo and cos in .hi; e selects which one.
    double2 sc = sincos_piby4(api, 0.0);
    long jr = xodd ^ as_long(e ? sc.hi : sc.lo);

    ir = ax < 0x1.0p+52 ? jr : ir;

    return as_double(ir);
}
diff --git a/amd-builtins/math64/sqrtD.cl b/amd-builtins/math64/sqrtD.cl new file mode 100644 index 0000000..9d7efb5 --- /dev/null +++ b/amd-builtins/math64/sqrtD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// HSAIL backend intrinsic providing a correctly-rounded f64 square root.
extern __attribute__((pure)) double __hsail_sqrt_f64(double);

// Double-precision sqrt: thin wrapper over the HSAIL intrinsic.
__attribute__((overloadable, always_inline, weak)) double
sqrt(double x)
{
    /* Use sqrt_f64 because our nsqrt_f64 does not have necessary precision. */
    return __hsail_sqrt_f64(x);
}
diff --git a/amd-builtins/math64/tables64.cl b/amd-builtins/math64/tables64.cl new file mode 100644 index 0000000..4cafec2 --- /dev/null +++ b/amd-builtins/math64/tables64.cl
@@ -0,0 +1,38 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math64.h" + +#include "pibits64.h" + +#include "expD_table.h" + +#include "cbrtD_table.h" + +#include "logD_table.h" + +#include "powD_table.h" + +#include "sinhcoshD_table.h" + +#include "atan2D_table.h" +
diff --git a/amd-builtins/math64/tanD.cl b/amd-builtins/math64/tanD.cl new file mode 100644 index 0000000..59c0742 --- /dev/null +++ b/amd-builtins/math64/tanD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"
#include "tanD_piby4.h"
#include "remainderD_piby2.h"

// Double-precision tan: reduce the argument modulo pi/2, then evaluate
// tan or -1/tan on the reduced interval depending on the octant.
__attribute__((overloadable)) double
tan(double x)
{
    double y = fabs(x);

    double r, rr;   // head/tail of the reduced argument
    int regn;       // quadrant index from the reduction

    // Medium-size arguments use the cheaper reduction; huge ones need
    // the full Payne-Hanek style path.
    if (y < 0x1.0p+30)
        remainder_piby2_medium(y, &r, &rr, &regn);
    else
        remainder_piby2_large(y, &r, &rr, &regn);

    // tan_piby4 returns tan(r+rr) in .x and -1/tan(r+rr) in .y;
    // odd quadrants use the -1/tan identity: tan(t + pi/2) = -1/tan(t).
    double2 tt = tan_piby4(r, rr);

    // tan is odd: flip the sign bit (top bit of the high word) for x < 0.
    int2 t = as_int2(regn & 1 ? tt.y : tt.x);
    t.hi ^= (x < 0.0) << 31;

    // NaN and +-Inf inputs return a quiet NaN.
    return __amdil_class_f64(x, SNAN|QNAN|PINF|NINF) ? as_double(QNANBITPATT_DP64) : as_double(t);
}
diff --git a/amd-builtins/math64/tanD_piby4.h b/amd-builtins/math64/tanD_piby4.h new file mode 100644 index 0000000..011b165 --- /dev/null +++ b/amd-builtins/math64/tanD_piby4.h
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

// tan(x + xx) approximation valid on the interval [-pi/4,pi/4].
// Also return -1/tan(x + xx) in .y
// (x, xx) is the head/tail form of the reduced argument; .lo/.x holds tan.
static inline double2
tan_piby4(double x, double xx)
{
    const double piby4_lead = 7.85398163397448278999e-01; // 0x3fe921fb54442d18
    const double piby4_tail = 3.06161699786838240164e-17; // 0x3c81a62633145c06

    // In order to maintain relative precision transform using the identity:
    // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
    // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4.

    // transform is +1 near pi/4, -1 near -pi/4, 0 otherwise.
    int ca = x > 0.68;
    int cb = x < -0.68;
    double transform = ca ? 1.0 : 0.0;
    transform = cb ? -1.0 : transform;

    // Shift the argument by +-pi/4 (lead and tail applied separately).
    double tx = fma(-transform, x, piby4_lead) + fma(-transform, xx, piby4_tail);
    int c = ca | cb;
    x = c ? tx : x;
    xx = c ? 0.0 : xx;

    // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
    double t1 = x;
    double r = fma(2.0, x*xx, x*x);   // (x+xx)^2 to working precision

    double a = fma(r,
	           fma(r, 0.224044448537022097264602535574e-3, -0.229345080057565662883358588111e-1),
		   0.372379159759792203640806338901e0);

    double b = fma(r,
	           fma(r,
		       fma(r, -0.232371494088563558304549252913e-3, 0.260656620398645407524064091208e-1),
		       -0.515658515729031149329237816945e0),
		   0.111713747927937668539901657944e1);

    double t2 = fma(MATH_DIVIDE(a, b), x*r, xx);

    double tp = t1 + t2;   // tan(x+xx) ~= t1 + t2

    // Compute -1.0/(t1 + t2) accurately
    // z1/z2 split tp into a high part (top 32 bits) and a correction so the
    // Newton-style fma below refines the reciprocal to full precision.
    double z1 = as_double(as_long(tp) & 0xffffffff00000000L);
    double z2 = t2 - (z1 - t1);
    double trec = -MATH_RECIP(tp);
    double trec_top = as_double(as_long(trec) & 0xffffffff00000000L);

    double tpr = fma(fma(trec_top, z2, fma(trec_top, z1, 1.0)), trec, trec_top);

    // Undo the +-pi/4 transformation for both tan and -1/tan.
    double tpt = transform * (1.0 - MATH_DIVIDE(2.0*tp, 1.0 + tp));
    double tptr = transform * (MATH_DIVIDE(2.0*tp, tp - 1.0) - 1.0);

    double2 ret;
    ret.lo = c ? tpt : tp;
    ret.hi = c ? tptr : tpr;
    return ret;
}
diff --git a/amd-builtins/math64/tanhD.cl b/amd-builtins/math64/tanhD.cl new file mode 100644 index 0000000..91dc1a8 --- /dev/null +++ b/amd-builtins/math64/tanhD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision hyperbolic tangent.
__attribute__((overloadable)) double
tanh(double x)
{
    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
    // to the following three formulae:
    // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
    // 2. (1 - (2/(exp(2*x) + 1 )))
    // 3. (exp(2*x) - 1)/(exp(2*x) + 1)
    // but computationally, some formulae are better on some ranges.

    // The point at which e^-x is insignificant compared to e^x = ln(2^27)
    const double large_threshold = 0x1.2b708872320e2p+4;

    ulong ux = as_ulong(x);
    ulong ax = ux & ~SIGNBIT_DP64;   // |x| bit pattern
    ulong sx = ux ^ ax;              // sign bit, reapplied at the end
    double y = as_double(ax);
    double y2 = y * y;

    // y < 0.9: rational minimax numerator (tanh(y) ~= y + y^3 * zn/zd)
    double znl = fma(y2,
	             fma(y2,
			 fma(y2, -0.142077926378834722618091e-7, -0.200047621071909498730453e-3),
			 -0.176016349003044679402273e-1),
		     -0.274030424656179760118928e0);

    double zdl = fma(y2,
	             fma(y2,
			 fma(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1),
			 0.381641414288328849317962e0),
		     0.822091273968539282568011e0);

    // 0.9 <= y <= 1: second rational approximation for the same form
    double znm = fma(y2,
	             fma(y2,
			 fma(y2, -0.115475878996143396378318e-7, -0.165597043903549960486816e-3),
			 -0.146173047288731678404066e-1),
		     -0.227793870659088295252442e0);

    double zdm = fma(y2,
	             fma(y2,
			 fma(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1),
			 0.317204558977294374244770e0),
		     0.683381611977295894959554e0);

    int c = y < 0.9;
    double zn = c ? znl : znm;
    double zd = c ? zdl : zdm;
    double z = y + y*y2 * MATH_DIVIDE(zn, zd);

    // y > 1: formula 2 above, tanh(y) = 1 - 2/(exp(2y) + 1)
    double p = exp(2.0 * y) + 1.0;
    double zg = 1.0 - 2.0 / p;

    z = y > 1.0 ? zg : z;

    // Other cases
    // Bitwise | on int predicates (branchless): tiny |x| returns x
    // (tanh(x) ~= x), and ax > +Inf pattern means NaN, which propagates.
    z = y < 0x1.0p-28 | ax > PINFBITPATT_DP64 ? x : z;

    // Saturate to 1 once exp(-y) is negligible.
    z = y > large_threshold ? 1.0 : z;

    // tanh is odd: OR the original sign bit back in.
    return as_double(sx | as_ulong(z));
}
diff --git a/amd-builtins/math64/tanpiD.cl b/amd-builtins/math64/tanpiD.cl new file mode 100644 index 0000000..d2e12a2 --- /dev/null +++ b/amd-builtins/math64/tanpiD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"
#include "tanD_piby4.h"

// Double-precision tanpi(x) = tan(pi * x).
// Maps the fractional part of |x| into [0, 0.25] by symmetry, evaluates
// tan or -1/tan of a*pi via tan_piby4, and assembles the sign bitwise.
__attribute__((overloadable)) double
tanpi(double x)
{
    const double pi = 3.1415926535897932384626433832795;

    long ix = as_long(x);
    long xsgn = ix & 0x8000000000000000L;    // sign bit of x
    long xnsgn = xsgn ^ 0x8000000000000000L; // flipped sign bit
    ix ^= xsgn;                              // |x| bit pattern
    double ax = as_double(ix);
    long iax = (long)ax;                     // integer part of |x|
    double r = ax - iax;                     // fractional part, in [0, 1)
    // tan(pi*x) has period 1, so only the parity of iax affects the
    // signed-zero result for integer arguments.
    long xodd = xsgn ^ (iax & 0x1 ? 0x8000000000000000L : 0L);

    // Initialize with return for +-Inf and NaN
    long ir = 0x7ff8000000000000L;

    // 2^53 <= |x| < Inf, the result is always even integer
    ir = ix < 0x7ff0000000000000L ? xsgn : ir;

    // 2^52 <= |x| < 2^53, the result is always integer
    ir = ix < 0x4340000000000000L ? xodd : ir;

    // |x| < 2^52: result depends on which 0.25-wide interval r falls in.
    // (NOTE(review): original comment said "0x1.0p-14 <= |x| < 2^53" — the
    // lower bound is a float-version leftover; no such cutoff exists here.)

    // r < 1.0: default quarter, a = 1 - r, use tan (e = 0), negated sign
    double a = 1.0 - r;
    int e = 0;
    long s = xnsgn;

    // r <= 0.75: a = r - 0.5, use -1/tan (e = 1)
    int c = r <= 0.75;
    double t = r - 0.5;
    a = c ? t : a;
    e = c ? 1 : e;
    s = c ? xsgn : s;

    // r < 0.5: a = 0.5 - r, still -1/tan, negated sign
    c = r < 0.5;
    t = 0.5 - r;
    a = c ? t : a;
    s = c ? xnsgn : s;

    // r <= 0.25: a = r, back to tan (e = 0)
    c = r <= 0.25;
    a = c ? r : a;
    e = c ? 0 : e;
    s = c ? xsgn : s;

    double api = a * pi;
    // tan_piby4 returns tan in .lo and -1/tan in .hi; e selects which one.
    double2 tt = tan_piby4(api, 0.0);
    long jr = s ^ as_long(e ? tt.hi : tt.lo);

    // Odd half-integers are poles: return signed infinity.
    long si = xodd | 0x7ff0000000000000L;
    jr = r == 0.5 ? si : jr;

    ir = ix < 0x4330000000000000L ? jr : ir;

    return as_double(ir);
}
diff --git a/amd-builtins/math64/tgammaD.cl b/amd-builtins/math64/tgammaD.cl new file mode 100644 index 0000000..020cb66 --- /dev/null +++ b/amd-builtins/math64/tgammaD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision gamma function, built on lgamma/exp, with the Euler
// reflection formula Gamma(x) = pi / (|x| * sin(pi*x) * Gamma(|x|)) for x < 0.
__attribute__((overloadable)) double
tgamma(double x)
{
    const double pi = 3.1415926535897932384626433832795;
    double ax = fabs(x);
    double lg = lgamma(ax);
    double g = exp(lg);      // Gamma(|x|) for x > 0 (may overflow to Inf)

    if (x < 0.0)
    {
        double z = sinpi(x);
        g = g * ax * z;      // |x| * sin(pi*x) * Gamma(|x|)
        g = pi / g;          // reflection formula
        // Denominator overflowed: result underflows to signed infinity? No —
        // pi/Inf gives 0; a zero denominator instead means overflow, so map
        // an exact-zero quotient source to +Inf.
        g = g == 0 ? as_double(PINFBITPATT_DP64) : g;
        // sin(pi*x) == 0 means x is a negative integer: Gamma has a pole,
        // return quiet NaN per OpenCL tgamma semantics.
        g = z == 0 ? as_double(QNANBITPATT_DP64) : g;
    }

    return g;
}
diff --git a/amd-builtins/math64/truncD.cl b/amd-builtins/math64/truncD.cl new file mode 100644 index 0000000..1be69cb --- /dev/null +++ b/amd-builtins/math64/truncD.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "math64.h"

// Double-precision trunc (round toward zero), delegated to the AMDIL
// round-to-zero intrinsic.
__attribute__ ((overloadable, always_inline)) double
trunc(double x)
{
    return __amdil_round_zero_f64(x);
}
diff --git a/amd-builtins/math64/vexpandD.cl b/amd-builtins/math64/vexpandD.cl new file mode 100644 index 0000000..4a797f6 --- /dev/null +++ b/amd-builtins/math64/vexpandD.cl
@@ -0,0 +1,907 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +__attribute__((overloadable, always_inline, weak)) double16 +frexp(double16 x, int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +frexp(double16 x, __global int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +frexp(double16 x, __local int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double8 +frexp(double8 x, int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +frexp(double8 x, __global int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +frexp(double8 x, __local int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double4 +frexp(double4 x, int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +frexp(double4 x, __global int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; 
+ r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +frexp(double4 x, __local int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double3 +frexp(double3 x, int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = frexp(x.s01, &j); + i.s01 = j; + r.s2 = frexp(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +frexp(double3 x, __global int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = frexp(x.s01, &j); + i.s01 = j; + r.s2 = frexp(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +frexp(double3 x, __local int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = frexp(x.s01, &j); + i.s01 = j; + r.s2 = frexp(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double2 +frexp(double2 x, int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +frexp(double2 x, __global int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +frexp(double2 x, __local int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = frexp(x.lo, &j); + i.lo = j; + r.hi = frexp(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double16 
+lgamma_r(double16 x, int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +lgamma_r(double16 x, __global int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +lgamma_r(double16 x, __local int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double8 +lgamma_r(double8 x, int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +lgamma_r(double8 x, __global int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +lgamma_r(double8 x, __local int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double4 +lgamma_r(double4 x, int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +lgamma_r(double4 x, __global int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; 
+ r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +lgamma_r(double4 x, __local int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double3 +lgamma_r(double3 x, int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = lgamma_r(x.s01, &j); + i.s01 = j; + r.s2 = lgamma_r(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +lgamma_r(double3 x, __global int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = lgamma_r(x.s01, &j); + i.s01 = j; + r.s2 = lgamma_r(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +lgamma_r(double3 x, __local int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = lgamma_r(x.s01, &j); + i.s01 = j; + r.s2 = lgamma_r(x.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double2 +lgamma_r(double2 x, int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +lgamma_r(double2 x, __global int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +lgamma_r(double2 x, __local int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = lgamma_r(x.lo, &j); + i.lo = j; + r.hi = lgamma_r(x.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + 
+__attribute__((overloadable, always_inline, weak)) double16 +remquo(double16 x, double16 y, int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +remquo(double16 x, double16 y, __global int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +remquo(double16 x, double16 y, __local int16 *p) +{ + double16 r; + int16 i; + int8 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double8 +remquo(double8 x, double8 y, int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +remquo(double8 x, double8 y, __global int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +remquo(double8 x, double8 y, __local int8 *p) +{ + double8 r; + int8 i; + int4 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double4 +remquo(double4 x, double4 y, int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef 
__clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +remquo(double4 x, double4 y, __global int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +remquo(double4 x, double4 y, __local int4 *p) +{ + double4 r; + int4 i; + int2 j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double3 +remquo(double3 x, double3 y, int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = remquo(x.s01, y.s01, &j); + i.s01 = j; + r.s2 = remquo(x.s2, y.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +remquo(double3 x, double3 y, __global int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = remquo(x.s01, y.s01, &j); + i.s01 = j; + r.s2 = remquo(x.s2, y.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +remquo(double3 x, double3 y, __local int3 *p) +{ + double3 r; + int3 i; + int2 j; + int k; + + r.s01 = remquo(x.s01, y.s01, &j); + i.s01 = j; + r.s2 = remquo(x.s2, y.s2, &k); + i.s2 = k; + + *p = i; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double2 +remquo(double2 x, double2 y, int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +remquo(double2 x, double2 y, __global int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; 
+ return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +remquo(double2 x, double2 y, __local int2 *p) +{ + double2 r; + int2 i; + int j; + + + r.lo = remquo(x.lo, y.lo, &j); + i.lo = j; + r.hi = remquo(x.hi, y.hi, &j); + i.hi = j; + + *p = i; + return r; +} + +#endif +
diff --git a/amd-builtins/math64/xvexpandD.cl b/amd-builtins/math64/xvexpandD.cl new file mode 100644 index 0000000..88b50c2 --- /dev/null +++ b/amd-builtins/math64/xvexpandD.cl
@@ -0,0 +1,908 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// XXX this file can be removed when clp is implemented + +__attribute__((overloadable, always_inline, weak)) double16 +fract(double16 x, double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +fract(double16 x, __global double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +fract(double16 x, __local double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double8 +fract(double8 x, double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +fract(double8 x, __global double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +fract(double8 x, __local double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double4 +fract(double4 x, double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) 
double4 +fract(double4 x, __global double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +fract(double4 x, __local double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double3 +fract(double3 x, double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = fract(x.s01, &a); + t.s01 = a; + r.s2 = fract(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +fract(double3 x, __global double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = fract(x.s01, &a); + t.s01 = a; + r.s2 = fract(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +fract(double3 x, __local double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = fract(x.s01, &a); + t.s01 = a; + r.s2 = fract(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double2 +fract(double2 x, double2 *p) +{ + double2 r; + double2 t; + double a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +fract(double2 x, __global double2 *p) +{ + double2 r; + double2 t; + double a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +fract(double2 x, __local double2 *p) +{ + double2 r; + double2 t; + 
double a; + + + r.lo = fract(x.lo, &a); + t.lo = a; + r.hi = fract(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double16 +modf(double16 x, double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +modf(double16 x, __global double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +modf(double16 x, __local double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double8 +modf(double8 x, double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +modf(double8 x, __global double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +modf(double8 x, __local double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double4 +modf(double4 x, double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + 
+__attribute__((overloadable, always_inline, weak)) double4 +modf(double4 x, __global double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +modf(double4 x, __local double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double3 +modf(double3 x, double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = modf(x.s01, &a); + t.s01 = a; + r.s2 = modf(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +modf(double3 x, __global double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = modf(x.s01, &a); + t.s01 = a; + r.s2 = modf(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +modf(double3 x, __local double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = modf(x.s01, &a); + t.s01 = a; + r.s2 = modf(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double2 +modf(double2 x, double2 *p) +{ + double2 r; + double2 t; + double a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +modf(double2 x, __global double2 *p) +{ + double2 r; + double2 t; + double a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +modf(double2 x, __local double2 *p) 
+{ + double2 r; + double2 t; + double a; + + + r.lo = modf(x.lo, &a); + t.lo = a; + r.hi = modf(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double16 +sincos(double16 x, double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +sincos(double16 x, __global double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double16 +sincos(double16 x, __local double16 *p) +{ + double16 r; + double16 t; + double8 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double8 +sincos(double8 x, double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +sincos(double8 x, __global double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double8 +sincos(double8 x, __local double8 *p) +{ + double8 r; + double8 t; + double4 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double4 +sincos(double4 x, double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + 
t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +sincos(double4 x, __global double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double4 +sincos(double4 x, __local double4 *p) +{ + double4 r; + double4 t; + double2 a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double3 +sincos(double3 x, double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = sincos(x.s01, &a); + t.s01 = a; + r.s2 = sincos(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +sincos(double3 x, __global double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = sincos(x.s01, &a); + t.s01 = a; + r.s2 = sincos(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double3 +sincos(double3 x, __local double3 *p) +{ + double3 r; + double3 t; + double2 a; + double b; + + r.s01 = sincos(x.s01, &a); + t.s01 = a; + r.s2 = sincos(x.s2, &b); + t.s2 = b; + + *p = t; + return r; +} + +#endif + +__attribute__((overloadable, always_inline, weak)) double2 +sincos(double2 x, double2 *p) +{ + double2 r; + double2 t; + double a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#ifndef __clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +sincos(double2 x, __global double2 *p) +{ + double2 r; + double2 t; + double a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif + +#ifndef 
__clang__ + +__attribute__((overloadable, always_inline, weak)) double2 +sincos(double2 x, __local double2 *p) +{ + double2 r; + double2 t; + double a; + + + r.lo = sincos(x.lo, &a); + t.lo = a; + r.hi = sincos(x.hi, &a); + t.hi = a; + + *p = t; + return r; +} + +#endif
diff --git a/amd-builtins/media/bfe.cl b/amd-builtins/media/bfe.cl new file mode 100644 index 0000000..715f893 --- /dev/null +++ b/amd-builtins/media/bfe.cl
@@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_bfe(uint2 v1, uint2 v2, uint2 v3) +{ + uint2 ret; + ret.x = __hsail_bfe(v1.x,v2.x, v3.x); + ret.y = __hsail_bfe(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_bfe(uint3 v1, uint3 v2, uint3 v3) +{ + uint3 ret; + ret.x = __hsail_bfe(v1.x,v2.x, v3.x); + ret.y = __hsail_bfe(v1.y,v2.y,v3.y); + ret.z = __hsail_bfe(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint4 amd_bfe(uint4 v1, uint4 v2, uint4 v3) +{ + uint4 ret; + ret.x = __hsail_bfe(v1.x,v2.x, v3.x); + ret.y = __hsail_bfe(v1.y,v2.y,v3.y); + ret.z = __hsail_bfe(v1.z,v2.z, v3.z); + ret.w = __hsail_bfe(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_bfe(uint8 v1, uint8 v2, uint8 v3) +{ + uint8 ret; + ret.s0 = __hsail_bfe(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_bfe(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_bfe(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_bfe(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_bfe(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_bfe(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_bfe(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_bfe(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_bfe(uint16 v1, uint16 v2, uint16 v3) +{ + uint16 ret; + ret.s0 = __hsail_bfe(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_bfe(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_bfe(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_bfe(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_bfe(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_bfe(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_bfe(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_bfe(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_bfe(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_bfe(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_bfe(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_bfe(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_bfe(v1.sc,v2.sc, v3.sc); + ret.sd = 
__hsail_bfe(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_bfe(v1.se,v2.se, v3.se); + ret.sf= __hsail_bfe(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_bfe(uint v1, uint v2, uint v3) +{ + return __hsail_bfe(v1,v2,v3); +} +__attribute__((overloadable,always_inline,const)) int2 amd_bfe(int2 v1, uint2 v2, uint2 v3) +{ + int2 ret; + ret.x = __hsail_ibfe(v1.x,v2.x, v3.x); + ret.y = __hsail_ibfe(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) int3 amd_bfe(int3 v1, uint3 v2, uint3 v3) +{ + int3 ret; + ret.x = __hsail_ibfe(v1.x,v2.x, v3.x); + ret.y = __hsail_ibfe(v1.y,v2.y,v3.y); + ret.z = __hsail_ibfe(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) int4 amd_bfe(int4 v1, uint4 v2, uint4 v3) +{ + int4 ret; + ret.x = __hsail_ibfe(v1.x,v2.x, v3.x); + ret.y = __hsail_ibfe(v1.y,v2.y,v3.y); + ret.z = __hsail_ibfe(v1.z,v2.z, v3.z); + ret.w = __hsail_ibfe(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) int8 amd_bfe(int8 v1, uint8 v2, uint8 v3) +{ + int8 ret; + ret.s0 = __hsail_ibfe(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_ibfe(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_ibfe(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_ibfe(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_ibfe(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_ibfe(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_ibfe(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_ibfe(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) int16 amd_bfe(int16 v1, uint16 v2, uint16 v3) +{ + int16 ret; + ret.s0 = __hsail_ibfe(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_ibfe(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_ibfe(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_ibfe(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_ibfe(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_ibfe(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_ibfe(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_ibfe(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_ibfe(v1.s8,v2.s8,v3.s8 
); + ret.s9 = __hsail_ibfe(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_ibfe(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_ibfe(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_ibfe(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_ibfe(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_ibfe(v1.se,v2.se, v3.se); + ret.sf= __hsail_ibfe(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) int amd_bfe(int v1, uint v2, uint v3) +{ + return __hsail_ibfe(v1,v2,v3); +}
diff --git a/amd-builtins/media/bfm.cl b/amd-builtins/media/bfm.cl new file mode 100644 index 0000000..5c77007 --- /dev/null +++ b/amd-builtins/media/bfm.cl
@@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_bfm(uint2 v1, uint2 v2 ) +{ + uint2 ret; + ret.x = __hsail_bfm(v1.x,v2.x); + ret.y = __hsail_bfm(v1.y,v2.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_bfm(uint3 v1, uint3 v2) +{ + uint3 ret; + ret.x = __hsail_bfm(v1.x,v2.x); + ret.y = __hsail_bfm(v1.y,v2.y); + ret.z = __hsail_bfm(v1.z,v2.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint4 amd_bfm(uint4 v1, uint4 v2) +{ + uint4 ret; + ret.x = __hsail_bfm(v1.x,v2.x); + ret.y = __hsail_bfm(v1.y,v2.y); + ret.z = __hsail_bfm(v1.z,v2.z); + ret.w = __hsail_bfm(v1.w,v2.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_bfm(uint8 v1, uint8 v2) +{ + uint8 ret; + ret.s0 = __hsail_bfm(v1.s0,v2.s0); + ret.s1 = __hsail_bfm(v1.s1,v2.s1); + ret.s2 = __hsail_bfm(v1.s2,v2.s2); + ret.s3 = __hsail_bfm(v1.s3,v2.s3); + ret.s4 = __hsail_bfm(v1.s4,v2.s4) ; + ret.s5 = __hsail_bfm(v1.s5,v2.s5); + ret.s6 = __hsail_bfm(v1.s6,v2.s6 ); + ret.s7 = __hsail_bfm(v1.s7,v2.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_bfm(uint16 v1, uint16 v2) +{ + uint16 ret; + ret.s0 = __hsail_bfm(v1.s0,v2.s0); + ret.s1 = __hsail_bfm(v1.s1,v2.s1); + ret.s2 = __hsail_bfm(v1.s2,v2.s2); + ret.s3 = __hsail_bfm(v1.s3,v2.s3); + ret.s4 = __hsail_bfm(v1.s4,v2.s4) ; + ret.s5 = __hsail_bfm(v1.s5,v2.s5); + ret.s6 = __hsail_bfm(v1.s6,v2.s6); + ret.s7 = __hsail_bfm(v1.s7,v2.s7); + ret.s8 = __hsail_bfm(v1.s8,v2.s8 ); + ret.s9 = __hsail_bfm(v1.s9,v2.s9); + ret.sa = __hsail_bfm(v1.sa,v2.sa); + ret.sb = __hsail_bfm(v1.sb,v2.sb); + ret.sc = __hsail_bfm(v1.sc,v2.sc); + ret.sd = __hsail_bfm(v1.sd,v2.sd); + ret.se = __hsail_bfm(v1.se,v2.se); + ret.sf= __hsail_bfm(v1.sf,v2.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_bfm(uint v1, uint v2) +{ + return __hsail_bfm(v1,v2); +}
diff --git a/amd-builtins/media/bitalign.cl b/amd-builtins/media/bitalign.cl new file mode 100644 index 0000000..c00f1b1 --- /dev/null +++ b/amd-builtins/media/bitalign.cl
@@ -0,0 +1,99 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "media.h" + +__attribute__((overloadable, always_inline)) uint +amd_bitalign(uint a, uint b, uint c) +{ + return __hsail_bitalign_b32(a, b, c); +} + +__attribute__((overloadable, always_inline)) uint2 +amd_bitalign(uint2 a, uint2 b, uint2 c) +{ + uint2 ret; + ret.x = __hsail_bitalign_b32(a.x, b.x, c.x); + ret.y = __hsail_bitalign_b32(a.y, b.y, c.y); + return ret; +} + +__attribute__((overloadable, always_inline)) uint3 +amd_bitalign(uint3 a, uint3 b, uint3 c) +{ + uint3 ret; + ret.x = __hsail_bitalign_b32(a.x, b.x, c.x); + ret.y = __hsail_bitalign_b32(a.y, b.y, c.y); + ret.z = __hsail_bitalign_b32(a.z, b.z, c.z); + return ret; + +} + +__attribute__((overloadable, always_inline)) uint4 +amd_bitalign(uint4 a, uint4 b, uint4 c) +{ + uint4 ret; + ret.x = __hsail_bitalign_b32(a.x, b.x, c.x); + ret.y = __hsail_bitalign_b32(a.y, b.y, c.y); + ret.z = __hsail_bitalign_b32(a.z, b.z, c.z); + ret.w = __hsail_bitalign_b32(a.w, b.w, c.w); + return ret; +} + +__attribute__((overloadable, always_inline)) uint8 +amd_bitalign(uint8 a, uint8 b, uint8 c) +{ + uint8 ret; + ret.s0 = __hsail_bitalign_b32(a.s0, b.s0, c.s0); + ret.s1 = __hsail_bitalign_b32(a.s1, b.s1, c.s1); + ret.s2 = __hsail_bitalign_b32(a.s2, b.s2, c.s2); + ret.s3 = __hsail_bitalign_b32(a.s3, b.s3, c.s3); + ret.s4 = __hsail_bitalign_b32(a.s4, b.s4, c.s4); + ret.s5 = __hsail_bitalign_b32(a.s5, b.s5, c.s5); + ret.s6 = __hsail_bitalign_b32(a.s6, b.s6, c.s6); + ret.s7 = __hsail_bitalign_b32(a.s7, b.s7, c.s7); + return ret; +} + +__attribute__((overloadable, always_inline)) uint16 +amd_bitalign(uint16 a, uint16 b, uint16 c) +{ + uint16 ret; + ret.s0 = __hsail_bitalign_b32(a.s0, b.s0, c.s0); + ret.s1 = __hsail_bitalign_b32(a.s1, b.s1, c.s1); + ret.s2 = __hsail_bitalign_b32(a.s2, b.s2, c.s2); + ret.s3 = __hsail_bitalign_b32(a.s3, b.s3, c.s3); + ret.s4 = __hsail_bitalign_b32(a.s4, b.s4, c.s4); + ret.s5 = __hsail_bitalign_b32(a.s5, b.s5, c.s5); + ret.s6 = __hsail_bitalign_b32(a.s6, b.s6, c.s6); + ret.s7 = 
__hsail_bitalign_b32(a.s7, b.s7, c.s7); + ret.s8 = __hsail_bitalign_b32(a.s8, b.s8, c.s8); + ret.s9 = __hsail_bitalign_b32(a.s9, b.s9, c.s9); + ret.sa = __hsail_bitalign_b32(a.sa, b.sa, c.sa); + ret.sb = __hsail_bitalign_b32(a.sb, b.sb, c.sb); + ret.sc = __hsail_bitalign_b32(a.sc, b.sc, c.sc); + ret.sd = __hsail_bitalign_b32(a.sd, b.sd, c.sd); + ret.se = __hsail_bitalign_b32(a.se, b.se, c.se); + ret.sf = __hsail_bitalign_b32(a.sf, b.sf, c.sf); + return ret; +} +
diff --git a/amd-builtins/media/bytealign.cl b/amd-builtins/media/bytealign.cl new file mode 100644 index 0000000..19798fc --- /dev/null +++ b/amd-builtins/media/bytealign.cl
@@ -0,0 +1,100 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "media.h" + +__attribute__((overloadable, always_inline)) uint +amd_bytealign(uint a, uint b, uint c) +{ + return __hsail_bytealign_b32(a, b, c); +} + +__attribute__((overloadable, always_inline)) uint2 +amd_bytealign(uint2 a, uint2 b, uint2 c) +{ + uint2 ret; + ret.x = __hsail_bytealign_b32(a.x, b.x, c.x); + ret.y = __hsail_bytealign_b32(a.y, b.y, c.y); + return ret; +} + +__attribute__((overloadable, always_inline)) uint3 +amd_bytealign(uint3 a, uint3 b, uint3 c) +{ + + uint3 ret; + ret.x = __hsail_bytealign_b32(a.x, b.x, c.x); + ret.y = __hsail_bytealign_b32(a.y, b.y, c.y); + ret.z = __hsail_bytealign_b32(a.z, b.z, c.z); + return ret; + +} + +__attribute__((overloadable, always_inline)) uint4 +amd_bytealign(uint4 a, uint4 b, uint4 c) +{ + uint4 ret; + ret.x = __hsail_bytealign_b32(a.x, b.x, c.x); + ret.y = __hsail_bytealign_b32(a.y, b.y, c.y); + ret.z = __hsail_bytealign_b32(a.z, b.z, c.z); + ret.w = __hsail_bytealign_b32(a.w, b.w, c.w); + return ret; +} + +__attribute__((overloadable, always_inline)) uint8 +amd_bytealign(uint8 a, uint8 b, uint8 c) +{ + uint8 ret; + ret.s0 = __hsail_bytealign_b32(a.s0, b.s0, c.s0); + ret.s1 = __hsail_bytealign_b32(a.s1, b.s1, c.s1); + ret.s2 = __hsail_bytealign_b32(a.s2, b.s2, c.s2); + ret.s3 = __hsail_bytealign_b32(a.s3, b.s3, c.s3); + ret.s4 = __hsail_bytealign_b32(a.s4, b.s4, c.s4); + ret.s5 = __hsail_bytealign_b32(a.s5, b.s5, c.s5); + ret.s6 = __hsail_bytealign_b32(a.s6, b.s6, c.s6); + ret.s7 = __hsail_bytealign_b32(a.s7, b.s7, c.s7); + return ret; +} + +__attribute__((overloadable, always_inline)) uint16 +amd_bytealign(uint16 a, uint16 b, uint16 c) +{ + uint16 ret; + ret.s0 = __hsail_bytealign_b32(a.s0, b.s0, c.s0); + ret.s1 = __hsail_bytealign_b32(a.s1, b.s1, c.s1); + ret.s2 = __hsail_bytealign_b32(a.s2, b.s2, c.s2); + ret.s3 = __hsail_bytealign_b32(a.s3, b.s3, c.s3); + ret.s4 = __hsail_bytealign_b32(a.s4, b.s4, c.s4); + ret.s5 = __hsail_bytealign_b32(a.s5, b.s5, c.s5); + ret.s6 = 
__hsail_bytealign_b32(a.s6, b.s6, c.s6); + ret.s7 = __hsail_bytealign_b32(a.s7, b.s7, c.s7); + ret.s8 = __hsail_bytealign_b32(a.s8, b.s8, c.s8); + ret.s9 = __hsail_bytealign_b32(a.s9, b.s9, c.s9); + ret.sa = __hsail_bytealign_b32(a.sa, b.sa, c.sa); + ret.sb = __hsail_bytealign_b32(a.sb, b.sb, c.sb); + ret.sc = __hsail_bytealign_b32(a.sc, b.sc, c.sc); + ret.sd = __hsail_bytealign_b32(a.sd, b.sd, c.sd); + ret.se = __hsail_bytealign_b32(a.se, b.se, c.se); + ret.sf = __hsail_bytealign_b32(a.sf, b.sf, c.sf); + return ret; +} +
diff --git a/amd-builtins/media/lerp.cl b/amd-builtins/media/lerp.cl new file mode 100644 index 0000000..76bd587 --- /dev/null +++ b/amd-builtins/media/lerp.cl
@@ -0,0 +1,100 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "media.h" + +__attribute__((overloadable, always_inline)) uint +amd_lerp(uint a, uint b, uint c) +{ + return __hsail_lerp_u8x4(a, b, c); +} + +__attribute__((overloadable, always_inline)) uint2 +amd_lerp(uint2 a, uint2 b, uint2 c) +{ + uint2 ret; + ret.x = __hsail_lerp_u8x4(a.x, b.x, c.x); + ret.y = __hsail_lerp_u8x4(a.y, b.y, c.y); + return ret; +} + +__attribute__((overloadable, always_inline)) uint3 +amd_lerp(uint3 a, uint3 b, uint3 c) +{ + + uint3 ret; + ret.x = __hsail_lerp_u8x4(a.x, b.x, c.x); + ret.y = __hsail_lerp_u8x4(a.y, b.y, c.y); + ret.z = __hsail_lerp_u8x4(a.z, b.z, c.z); + return ret; + +} + +__attribute__((overloadable, always_inline)) uint4 +amd_lerp(uint4 a, uint4 b, uint4 c) +{ + uint4 ret; + ret.x = __hsail_lerp_u8x4(a.x, b.x, c.x); + ret.y = __hsail_lerp_u8x4(a.y, b.y, c.y); + ret.z = __hsail_lerp_u8x4(a.z, b.z, c.z); + ret.w = __hsail_lerp_u8x4(a.w, b.w, c.w); + return ret; +} + +__attribute__((overloadable, always_inline)) uint8 +amd_lerp(uint8 a, uint8 b, uint8 c) +{ + uint8 ret; + ret.s0 = __hsail_lerp_u8x4(a.s0, b.s0, c.s0); + ret.s1 = __hsail_lerp_u8x4(a.s1, b.s1, c.s1); + ret.s2 = __hsail_lerp_u8x4(a.s2, b.s2, c.s2); + ret.s3 = __hsail_lerp_u8x4(a.s3, b.s3, c.s3); + ret.s4 = __hsail_lerp_u8x4(a.s4, b.s4, c.s4); + ret.s5 = __hsail_lerp_u8x4(a.s5, b.s5, c.s5); + ret.s6 = __hsail_lerp_u8x4(a.s6, b.s6, c.s6); + ret.s7 = __hsail_lerp_u8x4(a.s7, b.s7, c.s7); + return ret; +} + +__attribute__((overloadable, always_inline)) uint16 +amd_lerp(uint16 a, uint16 b, uint16 c) +{ + uint16 ret; + ret.s0 = __hsail_lerp_u8x4(a.s0, b.s0, c.s0); + ret.s1 = __hsail_lerp_u8x4(a.s1, b.s1, c.s1); + ret.s2 = __hsail_lerp_u8x4(a.s2, b.s2, c.s2); + ret.s3 = __hsail_lerp_u8x4(a.s3, b.s3, c.s3); + ret.s4 = __hsail_lerp_u8x4(a.s4, b.s4, c.s4); + ret.s5 = __hsail_lerp_u8x4(a.s5, b.s5, c.s5); + ret.s6 = __hsail_lerp_u8x4(a.s6, b.s6, c.s6); + ret.s7 = __hsail_lerp_u8x4(a.s7, b.s7, c.s7); + ret.s8 = __hsail_lerp_u8x4(a.s8, b.s8, c.s8); + ret.s9 = 
__hsail_lerp_u8x4(a.s9, b.s9, c.s9); + ret.sa = __hsail_lerp_u8x4(a.sa, b.sa, c.sa); + ret.sb = __hsail_lerp_u8x4(a.sb, b.sb, c.sb); + ret.sc = __hsail_lerp_u8x4(a.sc, b.sc, c.sc); + ret.sd = __hsail_lerp_u8x4(a.sd, b.sd, c.sd); + ret.se = __hsail_lerp_u8x4(a.se, b.se, c.se); + ret.sf = __hsail_lerp_u8x4(a.sf, b.sf, c.sf); + return ret; +} +
diff --git a/amd-builtins/media/max3.cl b/amd-builtins/media/max3.cl new file mode 100644 index 0000000..75b0f48 --- /dev/null +++ b/amd-builtins/media/max3.cl
@@ -0,0 +1,215 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_max3(uint2 v1, uint2 v2, uint2 v3) +{ + uint2 ret; + ret.x = __hsail_umax3(v1.x,v2.x, v3.x); + ret.y = __hsail_umax3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_max3(uint3 v1, uint3 v2, uint3 v3) +{ + uint3 ret; + ret.x = __hsail_umax3(v1.x,v2.x, v3.x); + ret.y = __hsail_umax3(v1.y,v2.y,v3.y); + ret.z = __hsail_umax3(v1.z,v2.z,v3.z); + return ret; +} + +__attribute__((overloadable,always_inline,const)) uint4 amd_max3(uint4 v1, uint4 v2, uint4 v3) +{ + uint4 ret; + ret.x = __hsail_umax3(v1.x,v2.x, v3.x); + ret.y = __hsail_umax3(v1.y,v2.y,v3.y); + ret.z = __hsail_umax3(v1.z,v2.z, v3.z); + ret.w = __hsail_umax3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_max3(uint8 v1, uint8 v2, uint8 v3) +{ + uint8 ret; + ret.s0 = __hsail_umax3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_umax3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_umax3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_umax3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_umax3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_umax3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_umax3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_umax3(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_max3(uint16 v1, uint16 v2, uint16 v3) +{ + uint16 ret; + ret.s0 = __hsail_umax3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_umax3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_umax3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_umax3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_umax3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_umax3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_umax3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_umax3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_umax3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_umax3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_umax3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_umax3(v1.sb,v2.sb,v3.sb); 
+ ret.sc = __hsail_umax3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_umax3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_umax3(v1.se,v2.se, v3.se); + ret.sf= __hsail_umax3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_max3(uint v1, uint v2, uint v3) +{ + return __hsail_umax3(v1,v2,v3) ; +} +__attribute__((overloadable,always_inline,const)) float2 amd_max3(float2 v1, float2 v2, float2 v3) +{ + float2 ret; + ret.x = __hsail_f32_max3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_max3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) float3 amd_max3(float3 v1, float3 v2, float3 v3) +{ + float3 ret; + ret.x = __hsail_f32_max3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_max3(v1.y,v2.y,v3.y); + ret.z = __hsail_f32_max3(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) float4 amd_max3(float4 v1, float4 v2, float4 v3) +{ + float4 ret; + ret.x = __hsail_f32_max3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_max3(v1.y,v2.y,v3.y); + ret.z = __hsail_f32_max3(v1.z,v2.z, v3.z); + ret.w = __hsail_f32_max3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) float8 amd_max3(float8 v1, float8 v2, float8 v3) +{ + float8 ret; + ret.s0 = __hsail_f32_max3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_f32_max3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_f32_max3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_f32_max3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_f32_max3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_f32_max3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_f32_max3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_f32_max3(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) float16 amd_max3(float16 v1, float16 v2, float16 v3) +{ + float16 ret; + ret.s0 = __hsail_f32_max3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_f32_max3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_f32_max3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_f32_max3(v1.s3,v2.s3,v3.s3); + ret.s4 = 
__hsail_f32_max3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_f32_max3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_f32_max3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_f32_max3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_f32_max3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_f32_max3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_f32_max3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_f32_max3(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_f32_max3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_f32_max3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_f32_max3(v1.se,v2.se, v3.se); + ret.sf= __hsail_f32_max3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) float amd_max3(float v1, float v2, float v3) +{ + return __hsail_f32_max3(v1,v2,v3); +} +__attribute__((overloadable,always_inline,const)) int2 amd_max3(int2 v1, int2 v2, int2 v3) +{ + int2 ret; + ret.x = __hsail_imax3(v1.x,v2.x, v3.x); + ret.y = __hsail_imax3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) int3 amd_max3(int3 v1, int3 v2, int3 v3) +{ + int3 ret; + ret.x = __hsail_imax3(v1.x,v2.x, v3.x); + ret.y = __hsail_imax3(v1.y,v2.y,v3.y); + ret.z = __hsail_imax3(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) int4 amd_max3(int4 v1, int4 v2, int4 v3) +{ + int4 ret; + ret.x = __hsail_imax3(v1.x,v2.x, v3.x); + ret.y = __hsail_imax3(v1.y,v2.y,v3.y); + ret.z = __hsail_imax3(v1.z,v2.z, v3.z); + ret.w = __hsail_imax3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) int8 amd_max3(int8 v1, int8 v2, int8 v3) +{ + int8 ret; + ret.s0 = __hsail_imax3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_imax3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_imax3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_imax3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_imax3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_imax3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_imax3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_imax3(v1.s7,v2.s7,v3.s7); + return ret; +} 
+__attribute__((overloadable,always_inline,const)) int16 amd_max3(int16 v1, int16 v2, int16 v3) +{ + int16 ret; + ret.s0 = __hsail_imax3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_imax3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_imax3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_imax3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_imax3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_imax3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_imax3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_imax3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_imax3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_imax3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_imax3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_imax3(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_imax3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_imax3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_imax3(v1.se,v2.se, v3.se); + ret.sf= __hsail_imax3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) int amd_max3(int v1, int v2, int v3) +{ + return __hsail_imax3(v1,v2,v3); +}
diff --git a/amd-builtins/media/media.h b/amd-builtins/media/media.h new file mode 100644 index 0000000..eb12c55 --- /dev/null +++ b/amd-builtins/media/media.h
@@ -0,0 +1,70 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#pragma OPENCL EXTENSION cl_amd_media_ops : enable + +extern __attribute__((const)) uint __hsail_bitalign_b32(uint, uint, uint); + +extern __attribute__((const)) uint __hsail_bytealign_b32(uint, uint, uint); + +extern __attribute__((pure)) uint __hsail_packcvt_u8x4_f32(float,float,float,float); + +extern __attribute__((pure)) uint __hsail_lerp_u8x4(uint,uint,uint); + +extern __attribute__((pure)) uint __hsail_sad_u32_u8x4(uint,uint,uint); + +extern __attribute__((pure)) uint __hsail_sadhi_u16x2_u8x4(uint,uint,uint); + +extern __attribute__((pure)) float __hsail_unpackcvt_f32_u8x4(uint,uint); + +extern __attribute__((const)) uint __hsail_msad(uint,uint,uint); + +extern __attribute__((const)) uint __hsail_sadd(uint,uint,uint); + +extern __attribute__((const)) uint __hsail_sadw(uint,uint,uint); + +extern __attribute__((const)) uint __hsail_umin3(uint,uint,uint); + +extern __attribute__((const)) int __hsail_imin3(int,int,int); + +extern __attribute__((const)) uint __hsail_umax3(uint,uint,uint); + +extern __attribute__((const)) int __hsail_imax3(int,int,int); + +extern __attribute__((const)) uint __hsail_umedian3(uint,uint,uint); + +extern __attribute__((const)) int __hsail_imedian3(int,int,int); + +extern __attribute__((const)) uint __hsail_bfe(uint,uint,uint); + +extern __attribute__((const)) float __hsail_f32_min3(float,float,float); + +extern __attribute__((const)) float __hsail_f32_max3(float,float,float); + +extern __attribute__((const)) float __hsail_f32_median3(float,float,float); + +extern __attribute__((const)) ulong __hsail_mqsad(ulong,uint,ulong); + +extern __attribute__((const)) ulong __hsail_qsad(ulong,uint,ulong); + +extern __attribute__((const)) uint __hsail_bfm(uint,uint); + +extern __attribute__((const)) int __hsail_ibfe(int,uint,uint);
diff --git a/amd-builtins/media/median3.cl b/amd-builtins/media/median3.cl new file mode 100644 index 0000000..e446538 --- /dev/null +++ b/amd-builtins/media/median3.cl
@@ -0,0 +1,215 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_median3(uint2 v1, uint2 v2, uint2 v3) +{ + uint2 ret; + ret.x = __hsail_umedian3(v1.x,v2.x, v3.x); + ret.y = __hsail_umedian3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_median3(uint3 v1, uint3 v2, uint3 v3) +{ + uint3 ret; + ret.x = __hsail_umedian3(v1.x,v2.x, v3.x); + ret.y = __hsail_umedian3(v1.y,v2.y,v3.y); + ret.z = __hsail_umedian3(v1.z,v2.z, v3.z); + return ret; +} + +__attribute__((overloadable,always_inline,const)) uint4 amd_median3(uint4 v1, uint4 v2, uint4 v3) +{ + uint4 ret; + ret.x = __hsail_umedian3(v1.x,v2.x, v3.x); + ret.y = __hsail_umedian3(v1.y,v2.y,v3.y); + ret.z = __hsail_umedian3(v1.z,v2.z, v3.z); + ret.w = __hsail_umedian3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_median3(uint8 v1, uint8 v2, uint8 v3) +{ + uint8 ret; + ret.s0 = __hsail_umedian3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_umedian3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_umedian3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_umedian3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_umedian3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_umedian3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_umedian3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_umedian3(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_median3(uint16 v1, uint16 v2, uint16 v3) +{ + uint16 ret; + ret.s0 = __hsail_umedian3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_umedian3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_umedian3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_umedian3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_umedian3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_umedian3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_umedian3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_umedian3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_umedian3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = 
__hsail_umedian3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_umedian3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_umedian3(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_umedian3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_umedian3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_umedian3(v1.se,v2.se, v3.se); + ret.sf= __hsail_umedian3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_median3(uint v1, uint v2, uint v3) +{ + return __hsail_umedian3(v1,v2,v3) ; +} +__attribute__((overloadable,always_inline,const)) float2 amd_median3(float2 v1, float2 v2, float2 v3) +{ + float2 ret; + ret.x = __hsail_f32_median3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_median3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) float3 amd_median3(float3 v1, float3 v2, float3 v3) +{ + float3 ret; + ret.x = __hsail_f32_median3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_median3(v1.y,v2.y,v3.y); + ret.z = __hsail_f32_median3(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) float4 amd_median3(float4 v1, float4 v2, float4 v3) +{ + float4 ret; + ret.x = __hsail_f32_median3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_median3(v1.y,v2.y,v3.y); + ret.z = __hsail_f32_median3(v1.z,v2.z, v3.z); + ret.w = __hsail_f32_median3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) float8 amd_median3(float8 v1, float8 v2, float8 v3) +{ + float8 ret; + ret.s0 = __hsail_f32_median3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_f32_median3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_f32_median3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_f32_median3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_f32_median3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_f32_median3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_f32_median3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_f32_median3(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) float16 amd_median3(float16 v1, float16 v2, float16 v3) +{ + float16 ret; + 
ret.s0 = __hsail_f32_median3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_f32_median3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_f32_median3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_f32_median3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_f32_median3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_f32_median3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_f32_median3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_f32_median3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_f32_median3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_f32_median3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_f32_median3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_f32_median3(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_f32_median3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_f32_median3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_f32_median3(v1.se,v2.se, v3.se); + ret.sf= __hsail_f32_median3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) float amd_median3(float v1, float v2, float v3) +{ + return __hsail_f32_median3(v1,v2,v3); +} +__attribute__((overloadable,always_inline,const)) int2 amd_median3(int2 v1, int2 v2, int2 v3) +{ + int2 ret; + ret.x = __hsail_imedian3(v1.x,v2.x, v3.x); + ret.y = __hsail_imedian3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) int3 amd_median3(int3 v1, int3 v2, int3 v3) +{ + int3 ret; + ret.x = __hsail_imedian3(v1.x,v2.x, v3.x); + ret.y = __hsail_imedian3(v1.y,v2.y,v3.y); + ret.z = __hsail_imedian3(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) int4 amd_median3(int4 v1, int4 v2, int4 v3) +{ + int4 ret; + ret.x = __hsail_imedian3(v1.x,v2.x, v3.x); + ret.y = __hsail_imedian3(v1.y,v2.y,v3.y); + ret.z = __hsail_imedian3(v1.z,v2.z, v3.z); + ret.w = __hsail_imedian3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) int8 amd_median3(int8 v1, int8 v2, int8 v3) +{ + int8 ret; + ret.s0 = __hsail_imedian3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_imedian3(v1.s1,v2.s1,v3.s1); + ret.s2 = 
__hsail_imedian3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_imedian3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_imedian3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_imedian3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_imedian3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_imedian3(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) int16 amd_median3(int16 v1, int16 v2, int16 v3) +{ + int16 ret; + ret.s0 = __hsail_imedian3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_imedian3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_imedian3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_imedian3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_imedian3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_imedian3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_imedian3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_imedian3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_imedian3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_imedian3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_imedian3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_imedian3(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_imedian3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_imedian3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_imedian3(v1.se,v2.se, v3.se); + ret.sf= __hsail_imedian3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) int amd_median3(int v1, int v2, int v3) +{ + return __hsail_imedian3(v1,v2,v3); +}
diff --git a/amd-builtins/media/min3.cl b/amd-builtins/media/min3.cl new file mode 100644 index 0000000..90901db --- /dev/null +++ b/amd-builtins/media/min3.cl
@@ -0,0 +1,215 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_min3(uint2 v1, uint2 v2, uint2 v3) +{ + uint2 ret; + ret.x = __hsail_umin3(v1.x,v2.x, v3.x); + ret.y = __hsail_umin3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_min3(uint3 v1, uint3 v2, uint3 v3) +{ + uint3 ret; + ret.x = __hsail_umin3(v1.x,v2.x, v3.x); + ret.y = __hsail_umin3(v1.y,v2.y,v3.y); + ret.z = __hsail_umin3(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint4 amd_min3(uint4 v1, uint4 v2, uint4 v3) +{ + uint4 ret; + ret.x = __hsail_umin3(v1.x,v2.x, v3.x); + ret.y = __hsail_umin3(v1.y,v2.y,v3.y); + ret.z = __hsail_umin3(v1.z,v2.z, v3.z); + ret.w = __hsail_umin3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_min3(uint8 v1, uint8 v2, uint8 v3) +{ + uint8 ret; + ret.s0 = __hsail_umin3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_umin3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_umin3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_umin3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_umin3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_umin3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_umin3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_umin3(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_min3(uint16 v1, uint16 v2, uint16 v3) +{ + uint16 ret; + ret.s0 = __hsail_umin3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_umin3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_umin3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_umin3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_umin3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_umin3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_umin3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_umin3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_umin3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_umin3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_umin3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_umin3(v1.sb,v2.sb,v3.sb); 
+ ret.sc = __hsail_umin3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_umin3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_umin3(v1.se,v2.se, v3.se); + ret.sf= __hsail_umin3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_min3(uint v1, uint v2, uint v3) +{ + return __hsail_umin3(v1,v2,v3); +} +__attribute__((overloadable,always_inline,const)) float2 amd_min3(float2 v1, float2 v2, float2 v3) +{ + float2 ret; + ret.x = __hsail_f32_min3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_min3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) float3 amd_min3(float3 v1, float3 v2, float3 v3) +{ + float3 ret; + ret.x = __hsail_f32_min3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_min3(v1.y,v2.y,v3.y); + ret.z = __hsail_f32_min3(v1.z,v2.z, v3.z); + return ret; +} + +__attribute__((overloadable,always_inline,const)) float4 amd_min3(float4 v1, float4 v2, float4 v3) +{ + float4 ret; + ret.x = __hsail_f32_min3(v1.x,v2.x, v3.x); + ret.y = __hsail_f32_min3(v1.y,v2.y,v3.y); + ret.z = __hsail_f32_min3(v1.z,v2.z, v3.z); + ret.w = __hsail_f32_min3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) float8 amd_min3(float8 v1, float8 v2, float8 v3) +{ + float8 ret; + ret.s0 = __hsail_f32_min3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_f32_min3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_f32_min3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_f32_min3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_f32_min3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_f32_min3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_f32_min3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_f32_min3(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) float16 amd_min3(float16 v1, float16 v2, float16 v3) +{ + float16 ret; + ret.s0 = __hsail_f32_min3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_f32_min3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_f32_min3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_f32_min3(v1.s3,v2.s3,v3.s3); + ret.s4 = 
__hsail_f32_min3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_f32_min3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_f32_min3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_f32_min3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_f32_min3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_f32_min3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_f32_min3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_f32_min3(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_f32_min3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_f32_min3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_f32_min3(v1.se,v2.se, v3.se); + ret.sf= __hsail_f32_min3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) float amd_min3(float v1, float v2, float v3) +{ + return __hsail_f32_min3(v1,v2,v3); +} +__attribute__((overloadable,always_inline,const)) int2 amd_min3(int2 v1, int2 v2, int2 v3) +{ + int2 ret; + ret.x = __hsail_imin3(v1.x,v2.x, v3.x); + ret.y = __hsail_imin3(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) int3 amd_min3(int3 v1, int3 v2, int3 v3) +{ + int3 ret; + ret.x = __hsail_imin3(v1.x,v2.x, v3.x); + ret.y = __hsail_imin3(v1.y,v2.y,v3.y); + ret.z = __hsail_imin3(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) int4 amd_min3(int4 v1, int4 v2, int4 v3) +{ + int4 ret; + ret.x = __hsail_imin3(v1.x,v2.x, v3.x); + ret.y = __hsail_imin3(v1.y,v2.y,v3.y); + ret.z = __hsail_imin3(v1.z,v2.z, v3.z); + ret.w = __hsail_imin3(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) int8 amd_min3(int8 v1, int8 v2, int8 v3) +{ + int8 ret; + ret.s0 = __hsail_imin3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_imin3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_imin3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_imin3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_imin3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_imin3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_imin3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_imin3(v1.s7,v2.s7,v3.s7); + return ret; +} 
+__attribute__((overloadable,always_inline,const)) int16 amd_min3(int16 v1, int16 v2, int16 v3) +{ + int16 ret; + ret.s0 = __hsail_imin3(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_imin3(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_imin3(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_imin3(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_imin3(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_imin3(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_imin3(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_imin3(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_imin3(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_imin3(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_imin3(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_imin3(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_imin3(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_imin3(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_imin3(v1.se,v2.se, v3.se); + ret.sf= __hsail_imin3(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) int amd_min3(int v1, int v2, int v3) +{ + return __hsail_imin3(v1,v2,v3); +}
diff --git a/amd-builtins/media/mqsad.cl b/amd-builtins/media/mqsad.cl new file mode 100644 index 0000000..a9b551c --- /dev/null +++ b/amd-builtins/media/mqsad.cl
@@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) ulong2 amd_mqsad(ulong2 v1, uint2 v2, ulong2 v3) +{ + ulong2 ret; + ret.x = __hsail_mqsad(v1.x,v2.x, v3.x); + ret.y = __hsail_mqsad(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong3 amd_mqsad(ulong3 v1, uint3 v2, ulong3 v3) +{ + ulong3 ret; + ret.x = __hsail_mqsad(v1.x,v2.x, v3.x); + ret.y = __hsail_mqsad(v1.y,v2.y,v3.y); + ret.z = __hsail_mqsad(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong4 amd_mqsad(ulong4 v1, uint4 v2, ulong4 v3) +{ + ulong4 ret; + ret.x = __hsail_mqsad(v1.x,v2.x, v3.x); + ret.y = __hsail_mqsad(v1.y,v2.y,v3.y); + ret.z = __hsail_mqsad(v1.z,v2.z, v3.z); + ret.w = __hsail_mqsad(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong8 amd_mqsad(ulong8 v1, uint8 v2, ulong8 v3) +{ + ulong8 ret; + ret.s0 = __hsail_mqsad(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_mqsad(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_mqsad(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_mqsad(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_mqsad(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_mqsad(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_mqsad(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_mqsad(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong16 amd_mqsad(ulong16 v1, uint16 v2, ulong16 v3) +{ + ulong16 ret; + ret.s0 = __hsail_mqsad(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_mqsad(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_mqsad(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_mqsad(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_mqsad(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_mqsad(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_mqsad(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_mqsad(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_mqsad(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_mqsad(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_mqsad(v1.sa,v2.sa, v3.sa); + ret.sb = 
__hsail_mqsad(v1.sb,v2.sb,v3.sb); + ret.sc = __hsail_mqsad(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_mqsad(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_mqsad(v1.se,v2.se, v3.se); + ret.sf= __hsail_mqsad(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong amd_mqsad(ulong v1, uint v2, ulong v3) +{ + return __hsail_mqsad(v1,v2,v3); +} +
diff --git a/amd-builtins/media/msad.cl b/amd-builtins/media/msad.cl new file mode 100644 index 0000000..86b4dbc --- /dev/null +++ b/amd-builtins/media/msad.cl
@@ -0,0 +1,88 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_msad(uint2 v1, uint2 v2, uint2 v3) +{ + uint2 ret; + ret.x = __hsail_msad(v1.x,v2.x, v3.x); + ret.y = __hsail_msad(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_msad(uint3 v1, uint3 v2, uint3 v3) +{ + uint3 ret; + ret.x = __hsail_msad(v1.x,v2.x, v3.x); + ret.y = __hsail_msad(v1.y,v2.y,v3.y); + ret.z = __hsail_msad(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint4 amd_msad(uint4 v1, uint4 v2, uint4 v3) +{ + uint4 ret; + ret.x = __hsail_msad(v1.x,v2.x, v3.x); + ret.y = __hsail_msad(v1.y,v2.y,v3.y); + ret.z = __hsail_msad(v1.z,v2.z, v3.z); + ret.w = __hsail_msad(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_msad(uint8 v1, uint8 v2, uint8 v3) +{ + uint8 ret; + ret.s0 = __hsail_msad(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_msad(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_msad(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_msad(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_msad(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_msad(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_msad(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_msad(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_msad(uint16 v1, uint16 v2, uint16 v3) +{ + uint16 ret; + ret.s0 = __hsail_msad(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_msad(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_msad(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_msad(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_msad(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_msad(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_msad(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_msad(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_msad(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_msad(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_msad(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_msad(v1.sb,v2.sb,v3.sb); + ret.sc = 
__hsail_msad(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_msad(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_msad(v1.se,v2.se, v3.se); + ret.sf= __hsail_msad(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_msad(uint v1, uint v2, uint v3) +{ + return __hsail_msad(v1,v2,v3); +}
diff --git a/amd-builtins/media/pack.cl b/amd-builtins/media/pack.cl new file mode 100644 index 0000000..11e494e --- /dev/null +++ b/amd-builtins/media/pack.cl
@@ -0,0 +1,33 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "media.h"

/*
 * amd_pack: packs the four float components of v into a single uint by
 * forwarding them to the HSAIL packcvt_u8x4_f32 builtin.
 */
#ifdef __clang__
__attribute__((overloadable, always_inline))
#else
__attribute__((always_inline))
#endif
uint amd_pack(float4 v)
{
    return __hsail_packcvt_u8x4_f32(v.s0, v.s1, v.s2, v.s3);
}
diff --git a/amd-builtins/media/qsad.cl b/amd-builtins/media/qsad.cl new file mode 100644 index 0000000..096cc6f --- /dev/null +++ b/amd-builtins/media/qsad.cl
@@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) ulong2 amd_qsad(ulong2 v1, uint2 v2, ulong2 v3) +{ + ulong2 ret; + ret.x = __hsail_qsad(v1.x,v2.x, v3.x); + ret.y = __hsail_qsad(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong3 amd_qsad(ulong3 v1, uint3 v2, ulong3 v3) +{ + ulong3 ret; + ret.x = __hsail_qsad(v1.x,v2.x, v3.x); + ret.y = __hsail_qsad(v1.y,v2.y,v3.y); + ret.z = __hsail_qsad(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong4 amd_qsad(ulong4 v1, uint4 v2, ulong4 v3) +{ + ulong4 ret; + ret.x = __hsail_qsad(v1.x,v2.x, v3.x); + ret.y = __hsail_qsad(v1.y,v2.y,v3.y); + ret.z = __hsail_qsad(v1.z,v2.z, v3.z); + ret.w = __hsail_qsad(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong8 amd_qsad(ulong8 v1, uint8 v2, ulong8 v3) +{ + ulong8 ret; + ret.s0 = __hsail_qsad(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_qsad(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_qsad(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_qsad(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_qsad(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_qsad(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_qsad(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_qsad(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong16 amd_qsad(ulong16 v1, uint16 v2, ulong16 v3) +{ + ulong16 ret; + ret.s0 = __hsail_qsad(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_qsad(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_qsad(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_qsad(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_qsad(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_qsad(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_qsad(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_qsad(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_qsad(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_qsad(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_qsad(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_qsad(v1.sb,v2.sb,v3.sb); + ret.sc 
= __hsail_qsad(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_qsad(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_qsad(v1.se,v2.se, v3.se); + ret.sf= __hsail_qsad(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) ulong amd_qsad(ulong v1, uint v2, ulong v3) +{ + return __hsail_qsad(v1,v2,v3); +} +
diff --git a/amd-builtins/media/sad.cl b/amd-builtins/media/sad.cl new file mode 100644 index 0000000..1f81bf4 --- /dev/null +++ b/amd-builtins/media/sad.cl
@@ -0,0 +1,100 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "media.h" + +__attribute__((overloadable, always_inline)) uint +amd_sad(uint a, uint b, uint c) +{ + return __hsail_sad_u32_u8x4(a, b, c); +} + +__attribute__((overloadable, always_inline)) uint2 +amd_sad(uint2 a, uint2 b, uint2 c) +{ + uint2 ret; + ret.x = __hsail_sad_u32_u8x4(a.x, b.x, c.x); + ret.y = __hsail_sad_u32_u8x4(a.y, b.y, c.y); + return ret; +} + +__attribute__((overloadable, always_inline)) uint3 +amd_sad(uint3 a, uint3 b, uint3 c) +{ + + uint3 ret; + ret.x = __hsail_sad_u32_u8x4(a.x, b.x, c.x); + ret.y = __hsail_sad_u32_u8x4(a.y, b.y, c.y); + ret.z = __hsail_sad_u32_u8x4(a.z, b.z, c.z); + return ret; + +} + +__attribute__((overloadable, always_inline)) uint4 +amd_sad(uint4 a, uint4 b, uint4 c) +{ + uint4 ret; + ret.x = __hsail_sad_u32_u8x4(a.x, b.x, c.x); + ret.y = __hsail_sad_u32_u8x4(a.y, b.y, c.y); + ret.z = __hsail_sad_u32_u8x4(a.z, b.z, c.z); + ret.w = __hsail_sad_u32_u8x4(a.w, b.w, c.w); + return ret; +} + +__attribute__((overloadable, always_inline)) uint8 +amd_sad(uint8 a, uint8 b, uint8 c) +{ + uint8 ret; + ret.s0 = __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0); + ret.s1 = __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1); + ret.s2 = __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2); + ret.s3 = __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3); + ret.s4 = __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4); + ret.s5 = __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5); + ret.s6 = __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6); + ret.s7 = __hsail_sad_u32_u8x4(a.s7, b.s7, c.s7); + return ret; +} + +__attribute__((overloadable, always_inline)) uint16 +amd_sad(uint16 a, uint16 b, uint16 c) +{ + uint16 ret; + ret.s0 = __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0); + ret.s1 = __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1); + ret.s2 = __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2); + ret.s3 = __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3); + ret.s4 = __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4); + ret.s5 = __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5); + ret.s6 = __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6); + ret.s7 = __hsail_sad_u32_u8x4(a.s7, b.s7, 
c.s7); + ret.s8 = __hsail_sad_u32_u8x4(a.s8, b.s8, c.s8); + ret.s9 = __hsail_sad_u32_u8x4(a.s9, b.s9, c.s9); + ret.sa = __hsail_sad_u32_u8x4(a.sa, b.sa, c.sa); + ret.sb = __hsail_sad_u32_u8x4(a.sb, b.sb, c.sb); + ret.sc = __hsail_sad_u32_u8x4(a.sc, b.sc, c.sc); + ret.sd = __hsail_sad_u32_u8x4(a.sd, b.sd, c.sd); + ret.se = __hsail_sad_u32_u8x4(a.se, b.se, c.se); + ret.sf = __hsail_sad_u32_u8x4(a.sf, b.sf, c.sf); + return ret; +} +
diff --git a/amd-builtins/media/sad4.cl b/amd-builtins/media/sad4.cl new file mode 100644 index 0000000..38a60a4 --- /dev/null +++ b/amd-builtins/media/sad4.cl
@@ -0,0 +1,37 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "media.h"

/*
 * amd_sad4: chains four __hsail_sad_u32_u8x4 calls, accumulating each
 * component pair of x and y on top of the running total that starts at z.
 */
#ifdef __clang__
__attribute__((overloadable, always_inline))
#else
__attribute__((always_inline))
#endif
uint amd_sad4(uint4 x, uint4 y, uint z)
{
    uint acc = __hsail_sad_u32_u8x4(x.s0, y.s0, z);
    acc = __hsail_sad_u32_u8x4(x.s1, y.s1, acc);
    acc = __hsail_sad_u32_u8x4(x.s2, y.s2, acc);
    return __hsail_sad_u32_u8x4(x.s3, y.s3, acc);
}
diff --git a/amd-builtins/media/sadd.cl b/amd-builtins/media/sadd.cl new file mode 100644 index 0000000..36c8c05 --- /dev/null +++ b/amd-builtins/media/sadd.cl
@@ -0,0 +1,88 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_sadd(uint2 v1, uint2 v2, uint2 v3) +{ + uint2 ret; + ret.x = __hsail_sadd(v1.x,v2.x, v3.x); + ret.y = __hsail_sadd(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_sadd(uint3 v1, uint3 v2, uint3 v3) +{ + uint3 ret; + ret.x = __hsail_sadd(v1.x,v2.x, v3.x); + ret.y = __hsail_sadd(v1.y,v2.y,v3.y); + ret.z = __hsail_sadd(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint4 amd_sadd(uint4 v1, uint4 v2, uint4 v3) +{ + uint4 ret; + ret.x = __hsail_sadd(v1.x,v2.x, v3.x); + ret.y = __hsail_sadd(v1.y,v2.y,v3.y); + ret.z = __hsail_sadd(v1.z,v2.z, v3.z); + ret.w = __hsail_sadd(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_sadd(uint8 v1, uint8 v2, uint8 v3) +{ + uint8 ret; + ret.s0 = __hsail_sadd(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_sadd(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_sadd(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_sadd(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_sadd(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_sadd(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_sadd(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_sadd(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_sadd(uint16 v1, uint16 v2, uint16 v3) +{ + uint16 ret; + ret.s0 = __hsail_sadd(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_sadd(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_sadd(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_sadd(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_sadd(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_sadd(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_sadd(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_sadd(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_sadd(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_sadd(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_sadd(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_sadd(v1.sb,v2.sb,v3.sb); + ret.sc = 
__hsail_sadd(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_sadd(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_sadd(v1.se,v2.se, v3.se); + ret.sf= __hsail_sadd(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_sadd(uint v1, uint v2, uint v3) +{ + return __hsail_sadd(v1,v2,v3); +}
diff --git a/amd-builtins/media/sadhi.cl b/amd-builtins/media/sadhi.cl new file mode 100644 index 0000000..357e942 --- /dev/null +++ b/amd-builtins/media/sadhi.cl
@@ -0,0 +1,100 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "media.h" + +__attribute__((overloadable, always_inline)) uint +amd_sadhi(uint a, uint b, uint c) +{ + return __hsail_sadhi_u16x2_u8x4(a, b, c); +} + +__attribute__((overloadable, always_inline)) uint2 +amd_sadhi(uint2 a, uint2 b, uint2 c) +{ + uint2 ret; + ret.x = __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x); + ret.y = __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y); + return ret; +} + +__attribute__((overloadable, always_inline)) uint3 +amd_sadhi(uint3 a, uint3 b, uint3 c) +{ + + uint3 ret; + ret.x = __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x); + ret.y = __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y); + ret.z = __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z); + return ret; + +} + +__attribute__((overloadable, always_inline)) uint4 +amd_sadhi(uint4 a, uint4 b, uint4 c) +{ + uint4 ret; + ret.x = __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x); + ret.y = __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y); + ret.z = __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z); + ret.w = __hsail_sadhi_u16x2_u8x4(a.w, b.w, c.w); + return ret; +} + +__attribute__((overloadable, always_inline)) uint8 +amd_sadhi(uint8 a, uint8 b, uint8 c) +{ + uint8 ret; + ret.s0 = __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0); + ret.s1 = __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1); + ret.s2 = __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2); + ret.s3 = __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3); + ret.s4 = __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4); + ret.s5 = __hsail_sadhi_u16x2_u8x4(a.s5, b.s5, c.s5); + ret.s6 = __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6); + ret.s7 = __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7); + return ret; +} + +__attribute__((overloadable, always_inline)) uint16 +amd_sadhi(uint16 a, uint16 b, uint16 c) +{ + uint16 ret; + ret.s0 = __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0); + ret.s1 = __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1); + ret.s2 = __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2); + ret.s3 = __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3); + ret.s4 = __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4); + ret.s5 = __hsail_sadhi_u16x2_u8x4(a.s5, 
b.s5, c.s5); + ret.s6 = __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6); + ret.s7 = __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7); + ret.s8 = __hsail_sadhi_u16x2_u8x4(a.s8, b.s8, c.s8); + ret.s9 = __hsail_sadhi_u16x2_u8x4(a.s9, b.s9, c.s9); + ret.sa = __hsail_sadhi_u16x2_u8x4(a.sa, b.sa, c.sa); + ret.sb = __hsail_sadhi_u16x2_u8x4(a.sb, b.sb, c.sb); + ret.sc = __hsail_sadhi_u16x2_u8x4(a.sc, b.sc, c.sc); + ret.sd = __hsail_sadhi_u16x2_u8x4(a.sd, b.sd, c.sd); + ret.se = __hsail_sadhi_u16x2_u8x4(a.se, b.se, c.se); + ret.sf = __hsail_sadhi_u16x2_u8x4(a.sf, b.sf, c.sf); + return ret; +} +
diff --git a/amd-builtins/media/sadw.cl b/amd-builtins/media/sadw.cl new file mode 100644 index 0000000..3d13b7a --- /dev/null +++ b/amd-builtins/media/sadw.cl
@@ -0,0 +1,88 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "media.h" + +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable + +__attribute__((overloadable,always_inline,const)) uint2 amd_sadw(uint2 v1, uint2 v2, uint2 v3) +{ + uint2 ret; + ret.x = __hsail_sadw(v1.x,v2.x, v3.x); + ret.y = __hsail_sadw(v1.y,v2.y,v3.y); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint3 amd_sadw(uint3 v1, uint3 v2, uint3 v3) +{ + uint3 ret; + ret.x = __hsail_sadw(v1.x,v2.x, v3.x); + ret.y = __hsail_sadw(v1.y,v2.y,v3.y); + ret.z = __hsail_sadw(v1.z,v2.z, v3.z); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint4 amd_sadw(uint4 v1, uint4 v2, uint4 v3) +{ + uint4 ret; + ret.x = __hsail_sadw(v1.x,v2.x, v3.x); + ret.y = __hsail_sadw(v1.y,v2.y,v3.y); + ret.z = __hsail_sadw(v1.z,v2.z, v3.z); + ret.w = __hsail_sadw(v1.w,v2.w,v3.w); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint8 amd_sadw(uint8 v1, uint8 v2, uint8 v3) +{ + uint8 ret; + ret.s0 = __hsail_sadw(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_sadw(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_sadw(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_sadw(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_sadw(v1.s4,v2.s4,v3.s4 ); + ret.s5 = __hsail_sadw(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_sadw(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_sadw(v1.s7,v2.s7,v3.s7); + return ret; +} +__attribute__((overloadable,always_inline,const)) uint16 amd_sadw(uint16 v1, uint16 v2, uint16 v3) +{ + uint16 ret; + ret.s0 = __hsail_sadw(v1.s0,v2.s0, v3.s0); + ret.s1 = __hsail_sadw(v1.s1,v2.s1,v3.s1); + ret.s2 = __hsail_sadw(v1.s2,v2.s2, v3.s2); + ret.s3 = __hsail_sadw(v1.s3,v2.s3,v3.s3); + ret.s4 = __hsail_sadw(v1.s4,v2.s4,v3.s4) ; + ret.s5 = __hsail_sadw(v1.s5,v2.s5,v3.s5); + ret.s6 = __hsail_sadw(v1.s6,v2.s6,v3.s6 ); + ret.s7 = __hsail_sadw(v1.s7,v2.s7,v3.s7); + ret.s8 = __hsail_sadw(v1.s8,v2.s8,v3.s8 ); + ret.s9 = __hsail_sadw(v1.s9,v2.s9,v3.s9); + ret.sa = __hsail_sadw(v1.sa,v2.sa, v3.sa); + ret.sb = __hsail_sadw(v1.sb,v2.sb,v3.sb); + ret.sc = 
__hsail_sadw(v1.sc,v2.sc, v3.sc); + ret.sd = __hsail_sadw(v1.sd,v2.sd,v3.sd); + ret.se = __hsail_sadw(v1.se,v2.se, v3.se); + ret.sf= __hsail_sadw(v1.sf,v2.sf,v3.sf); + + return ret; +} +__attribute__((overloadable,always_inline,const)) uint amd_sadw(uint v1, uint v2, uint v3) +{ + return __hsail_sadw(v1,v2,v3); +}
diff --git a/amd-builtins/media/unpack.cl b/amd-builtins/media/unpack.cl new file mode 100644 index 0000000..96a35b2 --- /dev/null +++ b/amd-builtins/media/unpack.cl
@@ -0,0 +1,327 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "media.h" + +__attribute__((overloadable, always_inline)) float +amd_unpack0(uint a) +{ + return __hsail_unpackcvt_f32_u8x4(a,0); +} + +__attribute__((overloadable, always_inline)) float2 +amd_unpack0(uint2 a) +{ + float2 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,0); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,0); + return ret; +} + +__attribute__((overloadable, always_inline)) float3 +amd_unpack0(uint3 a) +{ + + float3 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,0); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,0); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,0); + return ret; + +} + +__attribute__((overloadable, always_inline)) float4 +amd_unpack0(uint4 a) +{ + float4 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,0); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,0); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,0); + ret.w = __hsail_unpackcvt_f32_u8x4(a.w,0); + return ret; +} + +__attribute__((overloadable, always_inline)) float8 +amd_unpack0(uint8 a) +{ + float8 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,0); + ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,0); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,0); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,0); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,0); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,0); + ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,0); + ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,0); + return ret; +} + +__attribute__((overloadable, always_inline)) float16 +amd_unpack0(uint16 a) +{ + float16 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,0); + ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,0); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,0); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,0); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,0); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,0); + ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,0); + ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,0); + ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,0); + ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,0); + ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,0); + 
ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,0); + ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,0); + ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,0); + ret.se = __hsail_unpackcvt_f32_u8x4(a.se,0); + ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,0); + return ret; +} + +__attribute__((overloadable, always_inline)) float +amd_unpack1(uint a) +{ + return __hsail_unpackcvt_f32_u8x4(a,1); +} + +__attribute__((overloadable, always_inline)) float2 +amd_unpack1(uint2 a) +{ + float2 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,1); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,1); + return ret; +} + +__attribute__((overloadable, always_inline)) float3 +amd_unpack1(uint3 a) +{ + + float3 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,1); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,1); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,1); + return ret; + +} + +__attribute__((overloadable, always_inline)) float4 +amd_unpack1(uint4 a) +{ + float4 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,1); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,1); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,1); + ret.w = __hsail_unpackcvt_f32_u8x4(a.w,1); + return ret; +} + +__attribute__((overloadable, always_inline)) float8 +amd_unpack1(uint8 a) +{ + float8 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,1); + ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,1); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,1); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,1); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,1); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,1); + ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,1); + ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,1); + return ret; +} + +__attribute__((overloadable, always_inline)) float16 +amd_unpack1(uint16 a) +{ + float16 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,1); + ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,1); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,1); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,1); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,1); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,1); + ret.s6 = 
__hsail_unpackcvt_f32_u8x4(a.s6,1); + ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,1); + ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,1); + ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,1); + ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,1); + ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,1); + ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,1); + ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,1); + ret.se = __hsail_unpackcvt_f32_u8x4(a.se,1); + ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,1); + return ret; +} + +__attribute__((overloadable, always_inline)) float +amd_unpack2(uint a) +{ + return __hsail_unpackcvt_f32_u8x4(a,2); +} + +__attribute__((overloadable, always_inline)) float2 +amd_unpack2(uint2 a) +{ + float2 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,2); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,2); + return ret; +} + +__attribute__((overloadable, always_inline)) float3 +amd_unpack2(uint3 a) +{ + + float3 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,2); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,2); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,2); + return ret; + +} + +__attribute__((overloadable, always_inline)) float4 +amd_unpack2(uint4 a) +{ + float4 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,2); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,2); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,2); + ret.w = __hsail_unpackcvt_f32_u8x4(a.w,2); + return ret; +} + +__attribute__((overloadable, always_inline)) float8 +amd_unpack2(uint8 a) +{ + float8 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,2); + ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,2); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,2); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,2); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,2); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,2); + ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,2); + ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,2); + return ret; +} + +__attribute__((overloadable, always_inline)) float16 +amd_unpack2(uint16 a) +{ + float16 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,2); + ret.s1 = 
__hsail_unpackcvt_f32_u8x4(a.s1,2); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,2); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,2); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,2); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,2); + ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,2); + ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,2); + ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,2); + ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,2); + ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,2); + ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,2); + ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,2); + ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,2); + ret.se = __hsail_unpackcvt_f32_u8x4(a.se,2); + ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,2); + return ret; +} + +__attribute__((overloadable, always_inline)) float +amd_unpack3(uint a) +{ + return __hsail_unpackcvt_f32_u8x4(a,3); +} + +__attribute__((overloadable, always_inline)) float2 +amd_unpack3(uint2 a) +{ + float2 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,3); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,3); + return ret; +} + +__attribute__((overloadable, always_inline)) float3 +amd_unpack3(uint3 a) +{ + + float3 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,3); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,3); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,3); + return ret; + +} + +__attribute__((overloadable, always_inline)) float4 +amd_unpack3(uint4 a) +{ + float4 ret; + ret.x = __hsail_unpackcvt_f32_u8x4(a.x,3); + ret.y = __hsail_unpackcvt_f32_u8x4(a.y,3); + ret.z = __hsail_unpackcvt_f32_u8x4(a.z,3); + ret.w = __hsail_unpackcvt_f32_u8x4(a.w,3); + return ret; +} + +__attribute__((overloadable, always_inline)) float8 +amd_unpack3(uint8 a) +{ + float8 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,3); + ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,3); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,3); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,3); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,3); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,3); + ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,3); 
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,3); + return ret; +} + +__attribute__((overloadable, always_inline)) float16 +amd_unpack3(uint16 a) +{ + float16 ret; + ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,3); + ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,3); + ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,3); + ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,3); + ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,3); + ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,3); + ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,3); + ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,3); + ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,3); + ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,3); + ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,3); + ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,3); + ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,3); + ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,3); + ret.se = __hsail_unpackcvt_f32_u8x4(a.se,3); + ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,3); + return ret; +}
diff --git a/amd-builtins/misc/amdil-to-hsail.cl b/amd-builtins/misc/amdil-to-hsail.cl new file mode 100644 index 0000000..43d3922 --- /dev/null +++ b/amd-builtins/misc/amdil-to-hsail.cl
@@ -0,0 +1,352 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +// __amdil_ to __hsail_ translation + +// HSAIL intrinsic functions used by math32 functions +extern __attribute__((pure)) float __hsail_fma_f32(float, float, float); +extern __attribute__((pure)) float __hsail_nfma_f32(float, float, float); +extern __attribute__((pure)) float __hsail_min_f32(float, float); +extern __attribute__((pure)) float __hsail_max_f32(float, float); +extern __attribute__((pure)) float __hsail_ftz_f32(float); +extern __attribute__((pure)) float __hsail_round_f32(float); +extern __attribute__((pure)) float __hsail_floor_f32(float); +extern __attribute__((pure)) float __hsail_ceil_f32(float); +extern __attribute__((pure)) float __hsail_trunc_f32(float); +extern __attribute__((pure)) float __hsail_abs_f32(float); + +extern __attribute__((pure)) int __hsail_min_s32(int, int); +extern __attribute__((pure)) int __hsail_max_s32(int, int); +extern __attribute__((pure)) uint __hsail_min_u32(uint, int); +extern __attribute__((pure)) uint __hsail_max_u32(uint, uint); +extern __attribute__((pure)) int __hsail_mulhi_s32(int, int); +extern __attribute__((pure)) uint __hsail_mulhi_u32(uint, uint); +extern __attribute__((pure)) int __hsail_mulhi_s64(int, int); +extern __attribute__((pure)) uint __hsail_mulhi_u64(uint, uint); + +// HSAIL intrinsic functions used by math64 functions +extern __attribute__((pure)) double __hsail_fma_f64(double, double, double); +extern __attribute__((pure)) double __hsail_nfma_f64(double, double, double); +extern __attribute__((pure)) double __hsail_max_f64(double, double); +extern __attribute__((pure)) double __hsail_min_f64(double, double); +extern __attribute__((pure)) double __hsail_round_f64(double); +extern __attribute__((pure)) double __hsail_floor_f64(double); +extern __attribute__((pure)) double __hsail_ceil_f64(double); +extern __attribute__((pure)) double __hsail_trunc_f64(double); +extern __attribute__((pure)) double __hsail_abs_f64(double); +extern __attribute__((pure)) double __hsail_nrsqrt_f64(double); 
extern __attribute__((pure)) double __hsail_nsqrt_f64(double);

extern __attribute__((pure)) uint __hsail_mad_u32(uint, uint, uint);

// HSAIL conversion intrinsics
extern __attribute__((pure)) float __cvt_f32_f16(uint op1);

extern __attribute__((pure)) float __cvt_f16_rtz_f32(float op1);
extern __attribute__((pure)) float __cvt_f16_rte_f32(float op1);
extern __attribute__((pure)) float __cvt_f16_rtn_f32(float op1);
extern __attribute__((pure)) float __cvt_f16_rtp_f32(float op1);

extern __attribute__((pure)) float __cvt_f16_rtz_f64(double op1);
extern __attribute__((pure)) float __cvt_f16_rte_f64(double op1);
extern __attribute__((pure)) float __cvt_f16_rtn_f64(double op1);
extern __attribute__((pure)) float __cvt_f16_rtp_f64(double op1);

// Misc HSAIL intrinsic functions
extern __attribute__((const)) uint __hsail_bitselect_u32(uint, uint, uint);
extern __attribute__((pure)) int __hsail_class_f32(float, int);
extern __attribute__((pure)) int __hsail_class_f64(double, int);
extern __attribute__((pure)) int __hsail_mad24_s32(int, int, int);
extern __attribute__((pure)) uint __hsail_mad24_u32(uint, uint, uint);
extern __attribute__((pure)) int __hsail_mul24_s32(int, int);
extern __attribute__((pure)) uint __hsail_mul24_u32(uint, uint);

extern __attribute__((pure)) int __hsail_popcount_u32_b32(int);

extern __attribute__((pure)) int __hsail_firstbit_u32(uint);

extern __attribute__((pure)) float __hsail_fraction_f32(float);
extern __attribute__((pure)) double __hsail_fraction_f64(double);

// Every __amdil_* entry point below is a thin weak forwarder onto the
// matching __hsail_* (or native_divide) implementation.  The repetitive
// wrapper boilerplate is generated by the AMDIL_WRAP_n macros, one per
// arity; RET/T0/T1/T2 give the signature and TARGET the callee.
#define AMDIL_WRAP_1(RET, NAME, T0, TARGET)                             \
__attribute__((weak, always_inline)) RET                                \
NAME(T0 a) { return TARGET(a); }
#define AMDIL_WRAP_2(RET, NAME, T0, T1, TARGET)                         \
__attribute__((weak, always_inline)) RET                                \
NAME(T0 a, T1 b) { return TARGET(a, b); }
#define AMDIL_WRAP_3(RET, NAME, T0, T1, T2, TARGET)                     \
__attribute__((weak, always_inline)) RET                                \
NAME(T0 a, T1 b, T2 c) { return TARGET(a, b, c); }

// __amdil_ math32 function defs
AMDIL_WRAP_2(float, __amdil_div_f32, float, float, native_divide)
AMDIL_WRAP_3(float, __amdil_fma_f32, float, float, float, __hsail_fma_f32)
AMDIL_WRAP_3(float, __amdil_mad_f32, float, float, float, __hsail_nfma_f32)
AMDIL_WRAP_2(float, __amdil_min_f32, float, float, __hsail_min_f32)
AMDIL_WRAP_2(float, __amdil_max_f32, float, float, __hsail_max_f32)
AMDIL_WRAP_1(float, __ftz_f32, float, __hsail_ftz_f32)
AMDIL_WRAP_1(float, __amdil_round_nearest_f32, float, __hsail_round_f32)
AMDIL_WRAP_1(float, __amdil_round_neginf_f32, float, __hsail_floor_f32)
AMDIL_WRAP_1(float, __amdil_round_posinf_f32, float, __hsail_ceil_f32)
AMDIL_WRAP_1(float, __amdil_round_zero_f32, float, __hsail_trunc_f32)
AMDIL_WRAP_1(float, __amdil_fabs_f32, float, __hsail_abs_f32)
AMDIL_WRAP_2(float, __amdil_improved_div_f32, float, float, native_divide)
AMDIL_WRAP_2(int, __amdil_imin_i32, int, int, __hsail_min_s32)
AMDIL_WRAP_2(int, __amdil_imax_i32, int, int, __hsail_max_s32)
AMDIL_WRAP_2(uint, __amdil_umin_u32, uint, uint, __hsail_min_u32)
AMDIL_WRAP_2(uint, __amdil_umax_u32, uint, uint, __hsail_max_u32)
AMDIL_WRAP_2(int, __amdil_imul_high_i32, int, int, __hsail_mulhi_s32)
AMDIL_WRAP_2(uint, __amdil_umul_high_u32, uint, uint, __hsail_mulhi_u32)
AMDIL_WRAP_3(uint, __amdil_umad_u32, uint, uint, uint, __hsail_mad_u32)

// __amdil_ math64 function defs
AMDIL_WRAP_3(double, __amdil_fma_f64, double, double, double, __hsail_fma_f64)
AMDIL_WRAP_3(double, __amdil_mad_f64, double, double, double, __hsail_nfma_f64)
AMDIL_WRAP_2(double, __amdil_max_f64, double, double, __hsail_max_f64)
AMDIL_WRAP_1(double, __amdil_round_nearest_f64, double, __hsail_round_f64)
AMDIL_WRAP_1(double, __amdil_round_neginf_f64, double, __hsail_floor_f64)
AMDIL_WRAP_1(double, __amdil_round_posinf_f64, double, __hsail_ceil_f64)
AMDIL_WRAP_1(double, __amdil_round_zero_f64, double, __hsail_trunc_f64)
AMDIL_WRAP_2(double, __amdil_min_f64, double, double, __hsail_min_f64)
AMDIL_WRAP_1(double, __amdil_fabs_f64, double, __hsail_abs_f64)
AMDIL_WRAP_1(double, __amdil_sqrt_f64, double, __hsail_nsqrt_f64)
AMDIL_WRAP_1(double, __amdil_rsq_f64, double, __hsail_nrsqrt_f64)

// __amdil conversion functions
AMDIL_WRAP_1(float, __amdil_half_to_float_f32, uint, __cvt_f32_f16)
AMDIL_WRAP_1(float, __amdil_float_to_half_f32, float, __cvt_f16_rtz_f32)
AMDIL_WRAP_1(float, __amdil_float_to_half_near_f32, float, __cvt_f16_rte_f32)
AMDIL_WRAP_1(float, __amdil_float_to_half_neg_inf_f32, float, __cvt_f16_rtn_f32)
AMDIL_WRAP_1(float, __amdil_float_to_half_plus_inf_f32, float, __cvt_f16_rtp_f32)
AMDIL_WRAP_1(float, __amdil_double_to_half_f64, double, __cvt_f16_rtz_f64)
AMDIL_WRAP_1(float, __amdil_double_to_half_near_f64, double, __cvt_f16_rte_f64)
AMDIL_WRAP_1(float, __amdil_double_to_half_neg_inf_f64, double, __cvt_f16_rtn_f64)
AMDIL_WRAP_1(float, __amdil_double_to_half_plus_inf_f64, double, __cvt_f16_rtp_f64)

// Misc __amdil_ function defs
AMDIL_WRAP_3(uint, __amdil_bfi_u32, uint, uint, uint, __hsail_bitselect_u32)

__attribute__((weak, always_inline)) int
__amdil_class_f32(float a, int b)
{
    // Widen the low class-test bit into an all-ones / all-zeros mask.
    return (__hsail_class_f32(a, b) & 0x1) ? (int)0xffffffffU : 0;
}

__attribute__((weak, always_inline)) int
__amdil_class_f64(double a, int b)
{
    // Widen the low class-test bit into an all-ones / all-zeros mask.
    return (__hsail_class_f64(a, b) & 0x1) ? (int)0xffffffffU : 0;
}

AMDIL_WRAP_3(int, __amdil_imad24_i32, int, int, int, __hsail_mad24_s32)
AMDIL_WRAP_3(uint, __amdil_umad24_u32, uint, uint, uint, __hsail_mad24_u32)
AMDIL_WRAP_2(int, __amdil_imul24_i32, int, int, __hsail_mul24_s32)
AMDIL_WRAP_2(uint, __amdil_umul24_u32, uint, uint, __hsail_mul24_u32)
AMDIL_WRAP_1(int, __amdil_count_bits_i32, int, __hsail_popcount_u32_b32)
AMDIL_WRAP_1(int, __amdil_ffb_hi_u32, uint, __hsail_firstbit_u32)

//#ifdef HSAIL_SPEC_CURRENT
AMDIL_WRAP_1(float, __amdil_fraction_f32, float, __hsail_fraction_f32)
AMDIL_WRAP_1(double, __amdil_fraction_f64, double, __hsail_fraction_f64)
//#endif

#undef AMDIL_WRAP_1
#undef AMDIL_WRAP_2
#undef AMDIL_WRAP_3
diff --git a/amd-builtins/misc/atomicWorkItemFence.cl b/amd-builtins/misc/atomicWorkItemFence.cl new file mode 100644 index 0000000..6ea86a0 --- /dev/null +++ b/amd-builtins/misc/atomicWorkItemFence.cl
@@ -0,0 +1,100 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */
#if __OPENCL_C_VERSION__ >= 200

// Backend hook: emits a memory fence for the given segment flags,
// memory order and memory scope.  The argument values are the BRIG
// encodings defined by the enums below.
extern void __atomic_memfence(uint flags, uint mo, uint msc);

// The numeric values of the three enums below must match the BRIG
// encodings expected by the HSAIL backend — do not renumber.
enum BrigMemoryFenceSegments {
  BRIG_MEMORY_FENCE_NONE = 0,
  BRIG_MEMORY_FENCE_GROUP = 1,
  BRIG_MEMORY_FENCE_GLOBAL = 2,
  BRIG_MEMORY_FENCE_BOTH = 3,
  BRIG_MEMORY_FENCE_IMAGE = 4
};

enum BrigMemoryOrder {
  BRIG_MEMORY_ORDER_NONE = 0,
  BRIG_MEMORY_ORDER_RELAXED = 1,
  BRIG_MEMORY_ORDER_ACQUIRE = 2,
  BRIG_MEMORY_ORDER_RELEASE = 3,
  BRIG_MEMORY_ORDER_ACQUIRE_RELEASE = 4
};

enum BrigMemoryScope {
  BRIG_MEMORY_SCOPE_NONE = 0,
  BRIG_MEMORY_SCOPE_WAVEFRONT = 1,
  BRIG_MEMORY_SCOPE_WORKGROUP = 2,
  BRIG_MEMORY_SCOPE_COMPONENT = 3,
  BRIG_MEMORY_SCOPE_SYSTEM = 4,
  BRIG_MEMORY_SCOPE_WORKITEM = 5
};

// Map an OpenCL memory_order onto the BRIG encoding.  acq_rel and
// seq_cst both map to ACQUIRE_RELEASE; anything unrecognized maps
// to NONE.
static inline uint getBrigMemoryOrder(memory_order mo) {
  switch(mo) {
  default : return BRIG_MEMORY_ORDER_NONE;
  case memory_order_relaxed : return BRIG_MEMORY_ORDER_RELAXED;
  case memory_order_release : return BRIG_MEMORY_ORDER_RELEASE;
  case memory_order_acquire : return BRIG_MEMORY_ORDER_ACQUIRE;
  case memory_order_acq_rel :
  case memory_order_seq_cst : return BRIG_MEMORY_ORDER_ACQUIRE_RELEASE;
  }
}

// Map an OpenCL memory_scope onto the BRIG encoding; unrecognized
// scopes map to NONE.
static inline uint getBrigMemoryScope(memory_scope msc) {
  switch(msc) {
  default : return BRIG_MEMORY_SCOPE_NONE;
  case memory_scope_work_group : return BRIG_MEMORY_SCOPE_WORKGROUP;
  case memory_scope_device : return BRIG_MEMORY_SCOPE_COMPONENT;
  case memory_scope_all_svm_devices : return BRIG_MEMORY_SCOPE_SYSTEM;
  case memory_scope_sub_group : return BRIG_MEMORY_SCOPE_WAVEFRONT;
  case memory_scope_work_item : return BRIG_MEMORY_SCOPE_WORKITEM;
  }
}

// OpenCL 2.0 atomic_work_item_fence: translates the flag/order/scope
// triple into one or two __atomic_memfence calls.
//
// - A local-only fence always uses WORKGROUP scope (the default set
//   below), regardless of msc.
// - Any fence touching global memory uses the caller-requested scope.
// - An image fence is emitted separately and, per the HSAIL mapping
//   used here, always with ACQUIRE_RELEASE order.
//   NOTE(review): the image fence ignores mo by design here — confirm
//   against the target's HSAIL mapping document.
#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) void
atomic_work_item_fence(/*cl_mem_fence_flags*/ unsigned flag, memory_order mo, memory_scope msc) {
  uint brigSegment = 0;
  uint brigMemoryOrder = getBrigMemoryOrder(mo);
  uint brigMemoryScope = BRIG_MEMORY_SCOPE_WORKGROUP;
  // relaxed fence has no effect
  if (mo == memory_order_relaxed) return;
  if ((flag & CLK_GLOBAL_MEM_FENCE) && (flag & CLK_LOCAL_MEM_FENCE)) {
    brigSegment = BRIG_MEMORY_FENCE_BOTH;
    brigMemoryScope = getBrigMemoryScope(msc);
  }
  else if (flag & CLK_GLOBAL_MEM_FENCE) {
    brigSegment = BRIG_MEMORY_FENCE_GLOBAL;
    brigMemoryScope = getBrigMemoryScope(msc);
  }
  else if (flag & CLK_LOCAL_MEM_FENCE) {
    brigSegment = BRIG_MEMORY_FENCE_GROUP;
  }
  if (brigSegment != 0) {
    __atomic_memfence(brigSegment, brigMemoryOrder, brigMemoryScope);
  }
  if (flag & CLK_IMAGE_MEM_FENCE) {
    brigMemoryScope = getBrigMemoryScope(msc);
    __atomic_memfence(BRIG_MEMORY_FENCE_IMAGE, BRIG_MEMORY_ORDER_ACQUIRE_RELEASE, brigMemoryScope);
  }
}
#endif // __OPENCL_C_VERSION__ >= 200
diff --git a/amd-builtins/misc/awgcpy.cl b/amd-builtins/misc/awgcpy.cl new file mode 100644 index 0000000..6a5f302 --- /dev/null +++ b/amd-builtins/misc/awgcpy.cl
@@ -0,0 +1,2696 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +extern __attribute__((pure)) int __hsail_workitemid_flat(void); + +__attribute__((always_inline)) static event_t +__AWGClgI1(__local uchar * dst, const __global uchar * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local uchar *, const __global uchar *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local char *, const __global char *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClgI1(__local uchar *dst, const __global uchar *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local uchar *, const __global uchar *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local char *, const __global char *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCglI1(__global uchar * dst, const __local uchar * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global 
uchar *, const __local uchar *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global char *, const __local char *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCglI1(__global uchar *dst, const __local uchar *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global uchar *, const __local uchar *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global char *, const __local char *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uchar *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global char *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClgI2(__local ushort * dst, const __global ushort * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local ushort *, const __global ushort *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local short *, const __global short *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClgI2(__local ushort *dst, const __global 
ushort *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local ushort *, const __global ushort *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local short *, const __global short *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCglI2(__global ushort * dst, const __local ushort * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global ushort *, const __local ushort *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global short *, const __local short *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCglI2(__global ushort *dst, const __local ushort *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t async_work_group_strided_copy(__global ushort *, const __local ushort *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t 
async_work_group_strided_copy(__global short *, const __local short *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global ushort *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global short *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClgI4(__local uint * dst, const __global uint * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local uint *, const __global uint *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local int *, const __global int *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClgI4(__local uint *dst, const __global uint *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local uint *, const __global uint *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local int *, const __global int *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCglI4(__global uint * dst, const __local uint * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = 
__hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global uint *, const __local uint *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global int *, const __local int *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCglI4(__global uint *dst, const __local uint *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global uint *, const __local uint *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global int *, const __local int *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uint *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global int *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClgI8(__local ulong * dst, const __global ulong * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local ulong *, const __global ulong *, size_t, event_t); +extern 
__attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local long *, const __global long *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClgI8(__local ulong *dst, const __global ulong *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local ulong *, const __global ulong *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local long *, const __global long *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCglI8(__global ulong * dst, const __local ulong * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global ulong *, const __local ulong *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global long *, const __local long *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCglI8(__global ulong *dst, const __local ulong *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern 
__attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global ulong *, const __local ulong *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global long *, const __local long *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global ulong *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global long *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__local float * dst, const __global float * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__local float *dst, const __global float *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global float * dst, const __local float * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global float *dst, const __local float *src, size_t n, size_t j, 
event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global float *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__local double * dst, const __global double * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__local double *dst, const __global double *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global double * dst, const __local double * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global double *dst, const __local double *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + 
dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global double *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg2I1(__local uchar2 * dst, const __global uchar2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local uchar2 *, const __global uchar2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local char2 *, const __global char2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg2I1(__local uchar2 *dst, const __global uchar2 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local uchar2 *, const __global uchar2 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local char2 *, const __global char2 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl2I1(__global uchar2 * dst, const __local uchar2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + 
} + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global uchar2 *, const __local uchar2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global char2 *, const __local char2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl2I1(__global uchar2 *dst, const __local uchar2 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global uchar2 *, const __local uchar2 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global char2 *, const __local char2 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uchar2 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global char2 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg2I2(__local ushort2 * dst, const __global ushort2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t async_work_group_copy(__local ushort2 *, const __global ushort2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t 
async_work_group_copy(__local short2 *, const __global short2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg2I2(__local ushort2 *dst, const __global ushort2 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local ushort2 *, const __global ushort2 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local short2 *, const __global short2 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl2I2(__global ushort2 * dst, const __local ushort2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global ushort2 *, const __local ushort2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global short2 *, const __local short2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl2I2(__global ushort2 *dst, const __local ushort2 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, 
alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global ushort2 *, const __local ushort2 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global short2 *, const __local short2 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global ushort2 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global short2 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg2I4(__local uint2 * dst, const __global uint2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local uint2 *, const __global uint2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local int2 *, const __global int2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg2I4(__local uint2 *dst, const __global uint2 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local uint2 *, const __global uint2 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local int2 *, const __global int2 *, size_t, size_t, 
event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl2I4(__global uint2 * dst, const __local uint2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global uint2 *, const __local uint2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global int2 *, const __local int2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl2I4(__global uint2 *dst, const __local uint2 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global uint2 *, const __local uint2 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global int2 *, const __local int2 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uint2 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global int2 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg2I8(__local ulong2 * dst, const __global ulong2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + 
while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local ulong2 *, const __global ulong2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local long2 *, const __global long2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg2I8(__local ulong2 *dst, const __global ulong2 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local ulong2 *, const __global ulong2 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local long2 *, const __global long2 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl2I8(__global ulong2 * dst, const __local ulong2 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global ulong2 *, const __local ulong2 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global long2 *, const __local long2 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl2I8(__global ulong2 *dst, const __local ulong2 *src, size_t n, size_t j, event_t 
e)
{
    /* Continuation of __AWGSCgl2I8: strided scatter, global <- local, destination stride j. */
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global ulong2 *, const __local ulong2 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global long2 *, const __local long2 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global ulong2 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global long2 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, float2 elements (no signed twin, so no alias). */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__local float2 * dst, const __global float2 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Strided gather: local <- global, float2, source stride j. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Contiguous copy: global <- local, float2 (body continues on the next file line). */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__global float2 * dst, const __local float2 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d =
ls.x * ls.y * ls.z;  /* continuation: flattened work-group size */
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* Strided scatter: global <- local, float2, destination stride j. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global float2 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, double2 elements. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__local double2 * dst, const __global double2 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Strided gather: local <- global, double2, source stride j. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Contiguous copy: global <- local, double2 elements. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__global double2 * dst, const __local double2 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* Declaration continues on the next file line. */
__attribute__((overloadable, always_inline, weak)) event_t
/* Strided scatter: global <- local, double2, destination stride j. */
async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global double2 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, uchar3 elements. */
__attribute__((always_inline)) static event_t
__AWGClg3I1(__local uchar3 * dst, const __global uchar3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* uchar3/char3 overloads share one body via alias. */
extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local uchar3 *, const __global uchar3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local char3 *, const __global char3 *, size_t, event_t);

/* Strided gather: local <- global, uchar3, source stride j. */
__attribute__((always_inline)) static event_t
__AWGSClg3I1(__local uchar3 *dst, const __global uchar3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local uchar3 *, const __global uchar3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local char3 *, const __global char3 *, size_t, size_t, event_t);

/* Definition continues on the next file line. */
__attribute__((always_inline)) static event_t
/* Contiguous copy: global <- local, uchar3 elements. */
__AWGCgl3I1(__global uchar3 * dst, const __local uchar3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global uchar3 *, const __local uchar3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global char3 *, const __local char3 *, size_t, event_t);

/* Strided scatter: global <- local, uchar3, destination stride j. */
__attribute__((always_inline)) static event_t
__AWGSCgl3I1(__global uchar3 *dst, const __local uchar3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global uchar3 *, const __local uchar3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global char3 *, const __local char3 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global uchar3 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global char3 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, ushort3 (body continues on the next file line). */
__attribute__((always_inline)) static event_t
__AWGClg3I2(__local ushort3 * dst, const __global ushort3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    /* continuation of __AWGClg3I2 */
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local ushort3 *, const __global ushort3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local short3 *, const __global short3 *, size_t, event_t);

/* Strided gather: local <- global, ushort3, source stride j. */
__attribute__((always_inline)) static event_t
__AWGSClg3I2(__local ushort3 *dst, const __global ushort3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local ushort3 *, const __global ushort3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local short3 *, const __global short3 *, size_t, size_t, event_t);

/* Contiguous copy: global <- local, ushort3 elements. */
__attribute__((always_inline)) static event_t
__AWGCgl3I2(__global ushort3 * dst, const __local ushort3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global ushort3 *, const __local ushort3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global short3 *, const __local short3 *, size_t, event_t);

/* Strided scatter: global <- local, ushort3 (body continues on the next file line). */
__attribute__((always_inline)) static event_t
__AWGSCgl3I2(__global ushort3 *dst, const __local ushort3 *src, size_t n, size_t j, event_t e)
{
    int4 ls =
(int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);  /* continuation of __AWGSCgl3I2 */
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global ushort3 *, const __local ushort3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global short3 *, const __local short3 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global ushort3 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global short3 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, uint3 elements. */
__attribute__((always_inline)) static event_t
__AWGClg3I4(__local uint3 * dst, const __global uint3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local uint3 *, const __global uint3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local int3 *, const __global int3 *, size_t, event_t);

/* Strided gather: local <- global, uint3, source stride j. */
__attribute__((always_inline)) static event_t
__AWGSClg3I4(__local uint3 *dst, const __global uint3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Alias declaration continues on the next file line. */
extern __attribute__((overloadable, weak,
alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local uint3 *, const __global uint3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local int3 *, const __global int3 *, size_t, size_t, event_t);

/* Contiguous copy: global <- local, uint3 elements. */
__attribute__((always_inline)) static event_t
__AWGCgl3I4(__global uint3 * dst, const __local uint3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global uint3 *, const __local uint3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global int3 *, const __local int3 *, size_t, event_t);

/* Strided scatter: global <- local, uint3, destination stride j. */
__attribute__((always_inline)) static event_t
__AWGSCgl3I4(__global uint3 *dst, const __local uint3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global uint3 *, const __local uint3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global int3 *, const __local int3 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global uint3 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global int3 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, ulong3 elements. */
__attribute__((always_inline)) static event_t
__AWGClg3I8(__local ulong3 * dst, const __global ulong3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local ulong3 *, const __global ulong3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local long3 *, const __global long3 *, size_t, event_t);

/* Strided gather: local <- global, ulong3, source stride j. */
__attribute__((always_inline)) static event_t
__AWGSClg3I8(__local ulong3 *dst, const __global ulong3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local ulong3 *, const __global ulong3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local long3 *, const __global long3 *, size_t, size_t, event_t);

/* Contiguous copy: global <- local, ulong3 elements. */
__attribute__((always_inline)) static event_t
__AWGCgl3I8(__global ulong3 * dst, const __local ulong3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* Second alias of this pair continues on the next file line. */
extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global ulong3 *, const __local ulong3 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global long3 *, const __local long3 *, size_t, event_t);

/* Strided scatter: global <- local, ulong3, destination stride j. */
__attribute__((always_inline)) static event_t
__AWGSCgl3I8(__global ulong3 *dst, const __local ulong3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global ulong3 *, const __local ulong3 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global long3 *, const __local long3 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global ulong3 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global long3 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, float3 elements. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__local float3 * dst, const __global float3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Strided gather: local <- global, float3 (closing brace on the next file line). */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Contiguous copy: global <- local, float3 elements. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__global float3 * dst, const __local float3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* Strided scatter: global <- local, float3, destination stride j. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global float3 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, double3 elements. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__local double3 * dst, const __global double3 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Strided gather: local <- global, double3, source stride j. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Contiguous copy: global <- local, double3 (body continues on the next file line). */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__global double3 * dst, const __local double3 * src, size_t n, event_t e)
{
    int4 ls =
(int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);  /* continuation */
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* Strided scatter: global <- local, double3, destination stride j. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global double3 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, uchar4 elements. */
__attribute__((always_inline)) static event_t
__AWGClg4I1(__local uchar4 * dst, const __global uchar4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local uchar4 *, const __global uchar4 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local char4 *, const __global char4 *, size_t, event_t);

/* Strided gather: local <- global, uchar4, source stride j. */
__attribute__((always_inline)) static event_t
__AWGSClg4I1(__local uchar4 *dst, const __global uchar4 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Alias declaration continues on the next file line. */
extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t
async_work_group_strided_copy(__local uchar4 *, const __global uchar4 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t async_work_group_strided_copy(__local char4 *, const __global char4 *, size_t, size_t, event_t);

/* Contiguous copy: global <- local, uchar4 elements. */
__attribute__((always_inline)) static event_t
__AWGCgl4I1(__global uchar4 * dst, const __local uchar4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global uchar4 *, const __local uchar4 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global char4 *, const __local char4 *, size_t, event_t);

/* Strided scatter: global <- local, uchar4, destination stride j. */
__attribute__((always_inline)) static event_t
__AWGSCgl4I1(__global uchar4 *dst, const __local uchar4 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global uchar4 *, const __local uchar4 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global char4 *, const __local char4 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global uchar4 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global char4 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, ushort4 elements. */
__attribute__((always_inline)) static event_t
__AWGClg4I2(__local ushort4 * dst, const __global ushort4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local ushort4 *, const __global ushort4 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local short4 *, const __global short4 *, size_t, event_t);

/* Strided gather: local <- global, ushort4, source stride j. */
__attribute__((always_inline)) static event_t
__AWGSClg4I2(__local ushort4 *dst, const __global ushort4 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local ushort4 *, const __global ushort4 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local short4 *, const __global short4 *, size_t, size_t, event_t);

/* Contiguous copy: global <- local, ushort4 elements. */
__attribute__((always_inline)) static event_t
__AWGCgl4I2(__global ushort4 * dst, const __local ushort4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

/* Alias declaration continues on the next file line. */
extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global ushort4 *, const __local ushort4 *,
size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global short4 *, const __local short4 *, size_t, event_t);

/* Strided scatter: global <- local, ushort4, destination stride j. */
__attribute__((always_inline)) static event_t
__AWGSCgl4I2(__global ushort4 *dst, const __local ushort4 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global ushort4 *, const __local ushort4 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global short4 *, const __local short4 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global ushort4 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global short4 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, uint4 elements. */
__attribute__((always_inline)) static event_t
__AWGClg4I4(__local uint4 * dst, const __global uint4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local uint4 *, const __global uint4 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local int4 *, const __global int4 *, size_t, event_t);

/* Strided gather, uint4 (signature continues on the next file line). */
__attribute__((always_inline)) static event_t
__AWGSClg4I4(__local uint4 *dst, const __global uint4 *src,
size_t n, size_t j, event_t e)
{
    /* Continuation of __AWGSClg4I4: strided gather, local <- global, source stride j. */
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local uint4 *, const __global uint4 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local int4 *, const __global int4 *, size_t, size_t, event_t);

/* Contiguous copy: global <- local, uint4 elements. */
__attribute__((always_inline)) static event_t
__AWGCgl4I4(__global uint4 * dst, const __local uint4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global uint4 *, const __local uint4 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global int4 *, const __local int4 *, size_t, event_t);

/* Strided scatter: global <- local, uint4, destination stride j. */
__attribute__((always_inline)) static event_t
__AWGSCgl4I4(__global uint4 *dst, const __local uint4 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t async_work_group_strided_copy(__global uint4 *, const __local uint4 *, size_t, size_t, event_t);
/* Second alias of this pair continues on the next file line. */
extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t
async_work_group_strided_copy(__global int4 *, const __local int4 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global uint4 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global int4 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, ulong4 elements. */
__attribute__((always_inline)) static event_t
__AWGClg4I8(__local ulong4 * dst, const __global ulong4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local ulong4 *, const __global ulong4 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local long4 *, const __global long4 *, size_t, event_t);

/* Strided gather: local <- global, ulong4, source stride j. */
__attribute__((always_inline)) static event_t
__AWGSClg4I8(__local ulong4 *dst, const __global ulong4 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i*j];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local ulong4 *, const __global ulong4 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local long4 *, const __global long4 *, size_t, size_t, event_t);

/* Contiguous copy: global <- local, ulong4 (body continues on the next file line). */
__attribute__((always_inline)) static event_t
__AWGCgl4I8(__global ulong4 * dst, const __local ulong4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1),
get_local_size(2), 0);  /* continuation of __AWGCgl4I8 */
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global ulong4 *, const __local ulong4 *, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global long4 *, const __local long4 *, size_t, event_t);

/* Strided scatter: global <- local, ulong4, destination stride j. */
__attribute__((always_inline)) static event_t
__AWGSCgl4I8(__global ulong4 *dst, const __local ulong4 *src, size_t n, size_t j, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i*j] = src[i];
        i += d;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    return e;
}

extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global ulong4 *, const __local ulong4 *, size_t, size_t, event_t);
extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global long4 *, const __local long4 *, size_t, size_t, event_t);

/* prefetch is a no-op on this target. */
__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global ulong4 *p, size_t n)
{
    // nothing to do
}

__attribute__((overloadable, always_inline, weak)) void
prefetch(const __global long4 *p, size_t n)
{
    // nothing to do
}

/* Contiguous copy: local <- global, float4 elements. */
__attribute__((overloadable, always_inline, weak)) event_t
async_work_group_copy(__local float4 * dst, const __global float4 * src, size_t n, event_t e)
{
    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
    size_t i = __hsail_workitemid_flat();
    size_t d = ls.x * ls.y * ls.z;
    while (i < n) {
        dst[i] = src[i];
        i += d;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return e;
}

/* Declaration continues on the next file line. */
__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global float4 * dst, const __local float4 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global float4 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__local double4 * dst, const __global double4 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), 
get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global double4 * dst, const __local double4 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global double4 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg8I1(__local uchar8 * dst, const __global uchar8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local uchar8 *, const __global uchar8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local char8 *, const __global char8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg8I1(__local uchar8 *dst, const __global uchar8 *src, size_t n, 
size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local uchar8 *, const __global uchar8 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local char8 *, const __global char8 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl8I1(__global uchar8 * dst, const __local uchar8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global uchar8 *, const __local uchar8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global char8 *, const __local char8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl8I1(__global uchar8 *dst, const __local uchar8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t async_work_group_strided_copy(__global uchar8 *, const __local uchar8 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t 
async_work_group_strided_copy(__global char8 *, const __local char8 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uchar8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global char8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg8I2(__local ushort8 * dst, const __global ushort8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local ushort8 *, const __global ushort8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local short8 *, const __global short8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg8I2(__local ushort8 *dst, const __global ushort8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local ushort8 *, const __global ushort8 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local short8 *, const __global short8 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl8I2(__global ushort8 * dst, const __local ushort8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), 
get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global ushort8 *, const __local ushort8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global short8 *, const __local short8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl8I2(__global ushort8 *dst, const __local ushort8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global ushort8 *, const __local ushort8 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global short8 *, const __local short8 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global ushort8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global short8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg8I4(__local uint8 * dst, const __global uint8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, 
alias("__AWGClg8I4"))) event_t async_work_group_copy(__local uint8 *, const __global uint8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg8I4"))) event_t async_work_group_copy(__local int8 *, const __global int8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg8I4(__local uint8 *dst, const __global uint8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local uint8 *, const __global uint8 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local int8 *, const __global int8 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl8I4(__global uint8 * dst, const __local uint8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global uint8 *, const __local uint8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global int8 *, const __local int8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl8I4(__global uint8 *dst, const __local uint8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; 
+ while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global uint8 *, const __local uint8 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global int8 *, const __local int8 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uint8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global int8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg8I8(__local ulong8 * dst, const __global ulong8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local ulong8 *, const __global ulong8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local long8 *, const __global long8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg8I8(__local ulong8 *dst, const __global ulong8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local ulong8 *, const __global ulong8 *, size_t, size_t, event_t); +extern 
__attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local long8 *, const __global long8 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl8I8(__global ulong8 * dst, const __local ulong8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global ulong8 *, const __local ulong8 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global long8 *, const __local long8 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl8I8(__global ulong8 *dst, const __local ulong8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global ulong8 *, const __local ulong8 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global long8 *, const __local long8 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global ulong8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global long8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__local float8 * dst, const __global 
float8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global float8 * dst, const __local float8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global float8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__local double8 * dst, const __global double8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * 
ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global double8 * dst, const __local double8 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global double8 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg16I1(__local uchar16 * dst, const __global uchar16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t 
async_work_group_copy(__local uchar16 *, const __global uchar16 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t async_work_group_copy(__local char16 *, const __global char16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg16I1(__local uchar16 *dst, const __global uchar16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local uchar16 *, const __global uchar16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local char16 *, const __global char16 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl16I1(__global uchar16 * dst, const __local uchar16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global uchar16 *, const __local uchar16 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global char16 *, const __local char16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl16I1(__global uchar16 *dst, const __local uchar16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * 
ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global uchar16 *, const __local uchar16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global char16 *, const __local char16 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uchar16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global char16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg16I2(__local ushort16 * dst, const __global ushort16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local ushort16 *, const __global ushort16 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local short16 *, const __global short16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg16I2(__local ushort16 *dst, const __global ushort16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local ushort16 *, const __global 
ushort16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local short16 *, const __global short16 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl16I2(__global ushort16 * dst, const __local ushort16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global ushort16 *, const __local ushort16 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global short16 *, const __local short16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl16I2(__global ushort16 *dst, const __local ushort16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global ushort16 *, const __local ushort16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global short16 *, const __local short16 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global ushort16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global short16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static 
event_t +__AWGClg16I4(__local uint16 * dst, const __global uint16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local uint16 *, const __global uint16 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local int16 *, const __global int16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg16I4(__local uint16 *dst, const __global uint16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local uint16 *, const __global uint16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local int16 *, const __global int16 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl16I4(__global uint16 * dst, const __local uint16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global uint16 *, const __local uint16 *, size_t, event_t); +extern 
__attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global int16 *, const __local int16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl16I4(__global uint16 *dst, const __local uint16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global uint16 *, const __local uint16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global int16 *, const __local int16 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global uint16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global int16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((always_inline)) static event_t +__AWGClg16I8(__local ulong16 * dst, const __global ulong16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local ulong16 *, const __global ulong16 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local long16 *, const __global long16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSClg16I8(__local ulong16 *dst, const __global ulong16 *src, size_t n, 
size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local ulong16 *, const __global ulong16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local long16 *, const __global long16 *, size_t, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGCgl16I8(__global ulong16 * dst, const __local ulong16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global ulong16 *, const __local ulong16 *, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global long16 *, const __local long16 *, size_t, event_t); + +__attribute__((always_inline)) static event_t +__AWGSCgl16I8(__global ulong16 *dst, const __local ulong16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) event_t async_work_group_strided_copy(__global ulong16 *, const __local ulong16 *, size_t, size_t, event_t); +extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) 
event_t async_work_group_strided_copy(__global long16 *, const __local long16 *, size_t, size_t, event_t); + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global ulong16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global long16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__local float16 * dst, const __global float16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global float16 * dst, const __local float16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i 
+= d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global float16 *p, size_t n) +{ + // nothing to do +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__local double16 * dst, const __global double16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i*j]; + i += d; + } + barrier(CLK_LOCAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_copy(__global double16 * dst, const __local double16 * src, size_t n, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) event_t +async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t n, size_t j, event_t e) +{ + int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0); + size_t i = __hsail_workitemid_flat(); + size_t d = ls.x * ls.y * ls.z; + while (i < n) { + dst[i*j] = src[i]; + i += d; + } + barrier(CLK_GLOBAL_MEM_FENCE); + return e; +} + +__attribute__((overloadable, always_inline, weak)) void +prefetch(const __global double16 *p, size_t n) +{ + 
// No-op: prefetch is only a performance hint, so doing nothing is a conforming implementation. +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) void +wait_group_events(int num_events, event_t *event_list) +{ + // No-op: the async_work_group_copy/async_work_group_strided_copy builtins above perform the copy synchronously and execute a barrier before returning, so every event is already complete by the time this is called. +}
diff --git a/amd-builtins/misc/bitsel.cl b/amd-builtins/misc/bitsel.cl new file mode 100644 index 0000000..2a12ffc --- /dev/null +++ b/amd-builtins/misc/bitsel.cl
@@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +extern __attribute__((pure)) uint __amdil_bfi_u32(uint, uint, uint); + +// [u]int + +__attribute__((always_inline)) static uint +__BSELI4(uint a, uint b, uint c) +{ + return __amdil_bfi_u32(c, b, a); +} + +extern __attribute__((overloadable, alias("__BSELI4"))) uint bitselect(uint, uint, uint); +extern __attribute__((overloadable, alias("__BSELI4"))) int bitselect(int, int, int); + +// float + +__attribute__((overloadable, always_inline)) float +bitselect(float a, float b, float c) +{ + return as_float(__amdil_bfi_u32(as_uint(c), as_uint(b), as_uint(a))); +} + +// [u]long + +// No __amdil equivalent, so use __hsail intrinsic here +extern __attribute__((const)) ulong __hsail_bitselect_u64(ulong, ulong, ulong); + +__attribute__((always_inline)) static ulong +__BSELI8(ulong a, ulong b, ulong c) +{ + return __hsail_bitselect_u64(c, b, a); +} + +extern __attribute__((overloadable, alias("__BSELI8"))) ulong bitselect(ulong, ulong, ulong); +extern __attribute__((overloadable, alias("__BSELI8"))) long bitselect(long, long, long); + +// double + +__attribute__((overloadable, always_inline)) double +bitselect(double a, double b, double c) +{ + return as_double(__hsail_bitselect_u64(as_ulong(c), as_ulong(b), as_ulong(a))); +} +
diff --git a/amd-builtins/misc/class.cl b/amd-builtins/misc/class.cl new file mode 100644 index 0000000..17f593e --- /dev/null +++ b/amd-builtins/misc/class.cl
@@ -0,0 +1,186 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#define SNAN 0x001 +#define QNAN 0x002 +#define NINF 0x004 +#define NNOR 0x008 +#define NSUB 0x010 +#define NZER 0x020 +#define PZER 0x040 +#define PSUB 0x080 +#define PNOR 0x100 +#define PINF 0x200 + +extern __attribute__((pure)) int __amdil_class_f32(float, int); +extern __attribute__((pure)) int __amdil_class_f64(double, int); + +#define FC(F,M) \ +__attribute__((overloadable, always_inline)) int \ +F(float x) \ +{ \ + return __amdil_class_f32(x, M) & 1; \ +} \ +__attribute__((overloadable, always_inline)) int2 \ +F(float2 x) \ +{ \ + int2 ret; \ + ret.s0 = __amdil_class_f32(x.s0, M); \ + ret.s1 = __amdil_class_f32(x.s1, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) int3 \ +F(float3 x) \ +{ \ + int3 ret; \ + ret.s0 = __amdil_class_f32(x.s0, M); \ + ret.s1 = __amdil_class_f32(x.s1, M); \ + ret.s2 = __amdil_class_f32(x.s2, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) int4 \ +F(float4 x) \ +{ \ + int4 ret; \ + ret.s0 = __amdil_class_f32(x.s0, M); \ + ret.s1 = __amdil_class_f32(x.s1, M); \ + ret.s2 = __amdil_class_f32(x.s2, M); \ + ret.s3 = __amdil_class_f32(x.s3, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) int8 \ +F(float8 x) \ +{ \ + int8 ret; \ + ret.s0 = __amdil_class_f32(x.s0, M); \ + ret.s1 = __amdil_class_f32(x.s1, M); \ + ret.s2 = __amdil_class_f32(x.s2, M); \ + ret.s3 = __amdil_class_f32(x.s3, M); \ + ret.s4 = __amdil_class_f32(x.s4, M); \ + ret.s5 = __amdil_class_f32(x.s5, M); \ + ret.s6 = __amdil_class_f32(x.s6, M); \ + ret.s7 = __amdil_class_f32(x.s7, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) int16 \ +F(float16 x) \ +{ \ + int16 ret; \ + ret.s0 = __amdil_class_f32(x.s0, M); \ + ret.s1 = __amdil_class_f32(x.s1, M); \ + ret.s2 = __amdil_class_f32(x.s2, M); \ + ret.s3 = __amdil_class_f32(x.s3, M); \ + ret.s4 = __amdil_class_f32(x.s4, M); \ + ret.s5 = __amdil_class_f32(x.s5, M); \ + ret.s6 = __amdil_class_f32(x.s6, M); \ + ret.s7 = 
__amdil_class_f32(x.s7, M); \ + ret.s8 = __amdil_class_f32(x.s8, M); \ + ret.s9 = __amdil_class_f32(x.s9, M); \ + ret.sa = __amdil_class_f32(x.sa, M); \ + ret.sb = __amdil_class_f32(x.sb, M); \ + ret.sc = __amdil_class_f32(x.sc, M); \ + ret.sd = __amdil_class_f32(x.sd, M); \ + ret.se = __amdil_class_f32(x.se, M); \ + ret.sf = __amdil_class_f32(x.sf, M); \ + return ret; \ +} + + +#define DC(F,M) \ +__attribute__((overloadable, always_inline)) int \ +F(double x) \ +{ \ + return __amdil_class_f64(x, M) & 1; \ +} \ +__attribute__((overloadable, always_inline)) long2 \ +F(double2 x) \ +{ \ + long2 ret; \ + ret.s0 = __amdil_class_f64(x.s0, M); \ + ret.s1 = __amdil_class_f64(x.s1, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) long3 \ +F(double3 x) \ +{ \ + long3 ret; \ + ret.s0 = __amdil_class_f64(x.s0, M); \ + ret.s1 = __amdil_class_f64(x.s1, M); \ + ret.s2 = __amdil_class_f64(x.s2, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) long4 \ +F(double4 x) \ +{ \ + long4 ret; \ + ret.s0 = __amdil_class_f64(x.s0, M); \ + ret.s1 = __amdil_class_f64(x.s1, M); \ + ret.s2 = __amdil_class_f64(x.s2, M); \ + ret.s3 = __amdil_class_f64(x.s3, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) long8 \ +F(double8 x) \ +{ \ + long8 ret; \ + ret.s0 = __amdil_class_f64(x.s0, M); \ + ret.s1 = __amdil_class_f64(x.s1, M); \ + ret.s2 = __amdil_class_f64(x.s2, M); \ + ret.s3 = __amdil_class_f64(x.s3, M); \ + ret.s4 = __amdil_class_f64(x.s4, M); \ + ret.s5 = __amdil_class_f64(x.s5, M); \ + ret.s6 = __amdil_class_f64(x.s6, M); \ + ret.s7 = __amdil_class_f64(x.s7, M); \ + return ret; \ +} \ +__attribute__((overloadable, always_inline)) long16 \ +F(double16 x) \ +{ \ + long16 ret; \ + ret.s0 = __amdil_class_f64(x.s0, M); \ + ret.s1 = __amdil_class_f64(x.s1, M); \ + ret.s2 = __amdil_class_f64(x.s2, M); \ + ret.s3 = __amdil_class_f64(x.s3, M); \ + ret.s4 = __amdil_class_f64(x.s4, M); \ + ret.s5 = __amdil_class_f64(x.s5, M); \ + 
ret.s6 = __amdil_class_f64(x.s6, M); \ + ret.s7 = __amdil_class_f64(x.s7, M); \ + ret.s8 = __amdil_class_f64(x.s8, M); \ + ret.s9 = __amdil_class_f64(x.s9, M); \ + ret.sa = __amdil_class_f64(x.sa, M); \ + ret.sb = __amdil_class_f64(x.sb, M); \ + ret.sc = __amdil_class_f64(x.sc, M); \ + ret.sd = __amdil_class_f64(x.sd, M); \ + ret.se = __amdil_class_f64(x.se, M); \ + ret.sf = __amdil_class_f64(x.sf, M); \ + return ret; \ +} + +FC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR)) +FC(isinf, (NINF|PINF)) +FC(isnan, (SNAN|QNAN)) +FC(isnormal, (NNOR|PNOR)) + +DC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR)) +DC(isinf, (NINF|PINF)) +DC(isnan, (SNAN|QNAN)) +DC(isnormal, (NNOR|PNOR)) +
diff --git a/amd-builtins/misc/counter.cl b/amd-builtins/misc/counter.cl new file mode 100644 index 0000000..8aef73b --- /dev/null +++ b/amd-builtins/misc/counter.cl
@@ -0,0 +1,44 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifdef USE_COUNTER + +#pragma OPENCL EXTENSION cl_amd_atomic_counters32 : enable + +extern uint __amdil_append_alloc_i32(counter32_t); +extern uint __amdil_append_consume_i32(counter32_t); + +__attribute__((overloadable, always_inline)) uint +atomic_inc(counter32_t p) +{ + return __amdil_append_alloc_i32(p); +} + +__attribute__((overloadable, always_inline)) uint +atomic_dec(counter32_t p) +{ + // append_consume returns the updated (post-decrement) value, while atomic_dec must return the value held before the decrement, hence the +1 fixup + return __amdil_append_consume_i32(p) + 1U; +} + +#endif +
diff --git a/amd-builtins/misc/floattointconversion.h b/amd-builtins/misc/floattointconversion.h new file mode 100644 index 0000000..dc1f7f2 --- /dev/null +++ b/amd-builtins/misc/floattointconversion.h
@@ -0,0 +1,79 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + + + +static inline double float_uint_to_double(uint x) +{ + double d; + float f = as_float(x); + + // Fix up subnormal, if necessary + uint fmant = x & 0x007fffff; + float temp = as_float(fmant | 0x3f800000); + temp -= 1.0; + d = (float)temp; + ulong ld = as_ulong(d); + ld -= 0x07e0000000000000; + d = as_double(ld); + d = fmant ? d : 0.0; + d = x & 0x80000000 ? -d : d; + d = (f != 0.0) ? (double)f : d; + + return d; + +} + +static inline uint double_to_float_uint(double d) +{ + uint dlow, dhigh, dsign; + float f = (float)d; + uint uf; + + double dabs = (d < 0.) ? 
-d : d; + + // Fix up subnormal + ulong ld; + ld = as_ulong(d); + dlow = ld; + dhigh = ld >> 32; + dsign = dhigh & 0x80000000; + + int dexp = (dhigh >> 20) & 0x7ff; + int shiftcount = 0x381 - dexp; + dhigh &= 0x000fffff; + dhigh |= 0x00100000; + dhigh = (dhigh << 3) | (dlow >> 29); + dlow <<= 3; + uint extrabits = dlow << (32 - shiftcount); + dlow = (dlow >> shiftcount) | (dhigh << (32 - shiftcount)); + dhigh >>= shiftcount; + dhigh = ((dlow > 0x80000000u) || + ((dlow == 0x80000000u) && ((dhigh & 1) | extrabits))) ? + dhigh + 1 : dhigh; + uf = dhigh | dsign; + uf = dabs >= 7.0064923216240869000000e-046 ? uf : 0; + + + uf = f != 0. ? as_uint(f) : uf; + return uf; +} \ No newline at end of file
diff --git a/amd-builtins/misc/minmax.cl b/amd-builtins/misc/minmax.cl new file mode 100644 index 0000000..eaf6ef1 --- /dev/null +++ b/amd-builtins/misc/minmax.cl
@@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +//#define G(F,T,N) \ +//__attribute__((overloadable, always_inline)) T##N \ +//F(T##N x, T##N y) \ +//{ \ +// T##N ret; \ +// ret.lo = F(x.lo, y.lo); \ +// ret.hi = F(x.hi, y.hi); \ +// return ret; \ +//} +// +//G(min,float,16) +//G(min,float,8) + +//__attribute__((overloadable, always_inline)) float4 +//min(float4 x, float4 y) +//{ +// return __amdil_min_v4f32(x, y); +//} +// +//__attribute__((overloadable, always_inline)) float3 +//min(float3 x, float3 y) +//{ +//#if defined VEC3_BACKEND +// return __amdil_min_v3f32(x, y); +//#else +// float3 ret; +// ret.xy = min(x.xy, y.xy); +// ret.z = min(x.z, y.z); +// return ret; +//#endif +//} +// +//__attribute__((overloadable, always_inline)) float2 +//min(float2 x, float2 y) +//{ +// return __amdil_min_v2f32(x, y); +//} + +extern __attribute__((pure)) float __hsail_min_f32(float,float); + +__attribute__((weak, overloadable, always_inline)) float +min(float x, float y) +{ + return __hsail_min_f32(x, y); +} + +//G(min,double,16) +//G(min,double,8) +//G(min,double,4) +//G(min,double,3) +//G(min,double,2) + +extern __attribute__((pure)) double __hsail_min_f64(double,double); + +__attribute__((weak, overloadable, always_inline)) double +min(double x, double y) +{ + return __hsail_min_f64(x, y); +} + +//G(max,float,16) +//G(max,float,8) +// +//__attribute__((overloadable, always_inline)) float4 +//max(float4 x, float4 y) +//{ +// return __amdil_max_v4f32(x, y); +//} +// +//__attribute__((overloadable, always_inline)) float3 +//max(float3 x, float3 y) +//{ +//#if defined VEC3_BACKEND +// return __amdil_max_v3f32(x, y); +//#else +// float3 ret; +// ret.xy = max(x.xy, y.xy); +// ret.z = max(x.z, y.z); +// return ret; +//#endif +//} +// +//__attribute__((overloadable, always_inline)) float2 +//max(float2 x, float2 y) +//{ +// return __amdil_max_v2f32(x, y); +//} + +extern __attribute__((pure)) float __hsail_max_f32(float,float); + +__attribute__((weak, overloadable, always_inline)) float +max(float x, float y) +{ + return 
__hsail_max_f32(x, y); +} + +//G(max,double,16) +//G(max,double,8) +//G(max,double,4) +//G(max,double,3) +//G(max,double,2) + +extern __attribute__((pure)) double __hsail_max_f64(double,double); + +__attribute__((weak, overloadable, always_inline)) double +max(double x, double y) +{ + return __hsail_max_f64(x, y); +}
diff --git a/amd-builtins/misc/printf_alloc.cl b/amd-builtins/misc/printf_alloc.cl new file mode 100644 index 0000000..7a1f277 --- /dev/null +++ b/amd-builtins/misc/printf_alloc.cl
@@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#if __OPENCL_C_VERSION__ >= 200 + +#ifndef NULL +#define NULL 0 +#endif + +extern __attribute__((const)) uint __hsail_ld_kernarg_u32(uint); +extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint); + +#define OFFSET 8 + +__global char* __printf_alloc(unsigned int bytes) +{ + // Functionality: + // The __get_printf_ptr is a builtin that is replaced by + // the backend. The first 8 bytes of the buffer returned + // by the call are skipped. + // buffer[0] maintains the latest offset in the buffer. The value + // is updated using atomic adds for the number of bytes + // requested in the function argument. + // buffer[4] has the size of the buffer + // when access needs to go over buffer[0] + size of buffer + // i.e. 
we have the buffer overflow condition -- we return NULL + // The buffer size is hard limited by sizeof(uint) + // + __global char* ptr; + if (sizeof(size_t) == 4) + ptr = (__global char*) __hsail_ld_kernarg_u32(12); + else + ptr = (__global char*) __hsail_ld_kernarg_u64(24); + uint size = ((global uint *)ptr)[1]; + uint offset = atomic_load_explicit((__global atomic_uint *)ptr, + memory_order_acquire, memory_scope_device); + for (;;) { + if (OFFSET + offset + bytes > size) + return NULL; + if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr, + &offset, offset+bytes, memory_order_acq_rel, memory_order_acquire, + memory_scope_device)) + break; + } + return ptr + OFFSET + offset; +} +#endif
diff --git a/amd-builtins/misc/relationals.cl b/amd-builtins/misc/relationals.cl new file mode 100644 index 0000000..b220128 --- /dev/null +++ b/amd-builtins/misc/relationals.cl
@@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +// Vector expansions for HSAIL relationals + +#define UnaryRelationalVector(oty, ity, fun, mgl) \ +__attribute__((weak,always_inline)) \ +oty##16 __##fun##_16##mgl(ity##16 a) \ +{ \ + oty##16 c; \ + c.lo = fun(a.lo); \ + c.hi = fun(a.hi); \ + return c; \ +} \ +__attribute__((weak,always_inline)) \ +oty##8 __##fun##_8##mgl(ity##8 a) \ +{ \ + oty##8 c; \ + c.lo = fun(a.lo); \ + c.hi = fun(a.hi); \ + return c; \ +} \ +__attribute__((weak,always_inline)) \ +oty##4 __##fun##_4##mgl(ity##4 a) \ +{ \ + oty##4 c; \ + c.lo = fun(a.lo); \ + c.hi = fun(a.hi); \ + return c; \ +} \ +__attribute__((weak,always_inline)) \ +oty##3 __##fun##_3##mgl(ity##3 a) \ +{ \ + oty##3 c; \ + c.xy = fun(a.xy); \ + c.z = fun(a.z); \ + return c; \ +} \ +__attribute__((weak,always_inline)) \ +oty##2 __##fun##_2##mgl(ity##2 a) \ +{ \ + oty##2 c; \ + c.lo = fun(a.lo); \ + c.hi = fun(a.hi); \ + return c; \ +} + +UnaryRelationalVector(int, float, isfinite, f32) +UnaryRelationalVector(long, double, isfinite, f64) + +UnaryRelationalVector(int, float, isinf, f32) +UnaryRelationalVector(long, double, isinf, f64) + +UnaryRelationalVector(int, float, isnan, f32) +UnaryRelationalVector(long, double, isnan, f64) + +UnaryRelationalVector(int, float, isnormal, f32) +UnaryRelationalVector(long, double, isnormal, f64) +
diff --git a/amd-builtins/misc/synchronization.cl b/amd-builtins/misc/synchronization.cl new file mode 100644 index 0000000..2e29c50 --- /dev/null +++ b/amd-builtins/misc/synchronization.cl
@@ -0,0 +1,64 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +extern void __hsail_memfence(); +extern void __hsail_memfence_global(); +extern void __hsail_memfence_group(); +extern void __hsail_barrier(); + +// Map an OpenCL fence-flag value onto the matching HSAIL memfence; any value other than exactly CLK_GLOBAL_MEM_FENCE or exactly CLK_LOCAL_MEM_FENCE (including the combined flags) falls back to the full __hsail_memfence(). +void mem_fence_impl(uint val) { + if (val == CLK_GLOBAL_MEM_FENCE) { + __hsail_memfence_global(); + } else if (val == CLK_LOCAL_MEM_FENCE) { + __hsail_memfence_group(); + } else { + __hsail_memfence(); + } +} +#ifdef __clang__ +__attribute__((overloadable)) +#endif +void mem_fence(uint val) { + mem_fence_impl(val); +} + +// NOTE(review): read_mem_fence and write_mem_fence are not weakened here; both issue the same full fence as mem_fence -- confirm this is intentional for this target. +#ifdef __clang__ +__attribute__((overloadable)) +#endif +void read_mem_fence(uint val) { + mem_fence_impl(val); +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +void write_mem_fence(uint val) { + mem_fence_impl(val); +} + +// NOTE(review): 'flags' is ignored; this assumes __hsail_barrier() also fences both local and global memory regardless of the requested scope -- confirm. +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) +void barrier(uint flags) { + __hsail_barrier(); +}
diff --git a/amd-builtins/misc/workitem.cl b/amd-builtins/misc/workitem.cl new file mode 100644 index 0000000..01244a0 --- /dev/null +++ b/amd-builtins/misc/workitem.cl
@@ -0,0 +1,238 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +extern __attribute__((const)) uint __hsail_get_global_size(uint); +extern __attribute__((const)) uint __hsail_get_global_id(uint); +extern __attribute__((const)) uint __hsail_workgroup_size(uint); +extern __attribute__((const)) uint __hsail_currentworkgroup_size(uint); +extern __attribute__((const)) uint __hsail_get_local_id(uint); +extern __attribute__((const)) uint __hsail_get_num_groups(uint); +extern __attribute__((const)) uint __hsail_get_group_id(uint); +extern __attribute__((const)) uint __hsail_get_work_dim(void); +extern __attribute__((const)) uint __hsail_ld_kernarg_u32(uint); +extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint); +extern __attribute__((pure)) uint __hsail_workitemid_flat(void); + +// FIXME - this will change to ulong soon +extern __attribute__((pure)) uint __hsail_workitemid_flatabs(void); + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_global_offset(uint d) { + if (sizeof(size_t) == 4) { // 32 bit + switch(d) { + default: + return 0; + case 0: + return __hsail_ld_kernarg_u32(0); + case 1: + return __hsail_ld_kernarg_u32(4); + case 2: + return __hsail_ld_kernarg_u32(8); + } + } else { // 64 bit + switch(d) { + default: + return 0; + case 0: + return __hsail_ld_kernarg_u64(0); + case 1: + return __hsail_ld_kernarg_u64(8); + case 2: + return __hsail_ld_kernarg_u64(16); + } + } +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_global_id(uint d) { + size_t id; + size_t o = get_global_offset(d); + switch(d) { + default: + id = 0; + break; + case 0: + id = __hsail_get_global_id(0); + break; + case 1: + id = __hsail_get_global_id(1); + break; + case 2: + id = __hsail_get_global_id(2); + break; + } + + return o + id; +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_local_id(uint d) { + 
switch(d) { + default: + return 0; + case 0: + return __hsail_get_local_id(0); + case 1: + return __hsail_get_local_id(1); + case 2: + return __hsail_get_local_id(2); + } +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_group_id(uint d) { + switch(d) { + default: + return 0; + case 0: + return __hsail_get_group_id(0); + case 1: + return __hsail_get_group_id(1); + case 2: + return __hsail_get_group_id(2); + } +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_global_size(uint d) { + switch(d) { + default: + return 1; + case 0: + return __hsail_get_global_size(0); + case 1: + return __hsail_get_global_size(1); + case 2: + return __hsail_get_global_size(2); + } +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_local_size(uint d) { + switch(d) { + default: + return 1; + case 0: + return __hsail_currentworkgroup_size(0); + case 1: + return __hsail_currentworkgroup_size(1); + case 2: + return __hsail_currentworkgroup_size(2); + } +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_num_groups(uint d) { + switch(d) { + default: + return 1; + case 0: + return __hsail_get_num_groups(0); + case 1: + return __hsail_get_num_groups(1); + case 2: + return __hsail_get_num_groups(2); + } +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +uint get_work_dim() { + return __hsail_get_work_dim(); +} + +#if __OPENCL_C_VERSION__ >= 200 +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_enqueued_local_size(uint d) { + switch(d) { + default: + return 1; + case 0: + return __hsail_workgroup_size(0); + case 1: + return __hsail_workgroup_size(1); + 
case 2: + return __hsail_workgroup_size(2); + } +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_global_linear_id(void) { +#if defined NO_WORKITEM_FLATABS + return (__hsail_get_global_id(2) * __hsail_get_global_size(1) + + __hsail_get_global_id(1)) * __hsail_get_global_size(0) + + __hsail_get_global_id(0); +#else + return __hsail_workitemid_flatabs(); +#endif +} + +#ifdef __clang__ + __attribute__((always_inline, overloadable)) +#else +__attribute__((always_inline)) +#endif +size_t get_local_linear_id(void) { + return __hsail_workitemid_flat(); +} + +#endif +
diff --git a/amd-builtins/pipes/commitp.cl b/amd-builtins/pipes/commitp.cl new file mode 100644 index 0000000..9068c9b --- /dev/null +++ b/amd-builtins/pipes/commitp.cl
@@ -0,0 +1,109 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +// +// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#include "pipes.h" + +#define __COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) void \ +__commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +DO_PIPE_INTERNAL_SIZE(__COMMIT_READ_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) void +__commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size) +{ +} + +#define __COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) void \ +__commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +DO_PIPE_INTERNAL_SIZE(__COMMIT_WRITE_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) void +__commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size) +{ +} + +// Work group functions + +#define __WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) void \ +__work_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) void +__work_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size) +{ +} + +#define __WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) void \ +__work_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) void +__work_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size) +{ +} + +// sub group functions + +#define __SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) void \ +__sub_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + 
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) void +__sub_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size) +{ +} + +#define __SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) void \ +__sub_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) void +__sub_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size) +{ +} +
diff --git a/amd-builtins/pipes/getp.cl b/amd-builtins/pipes/getp.cl new file mode 100644 index 0000000..896a9f5 --- /dev/null +++ b/amd-builtins/pipes/getp.cl
@@ -0,0 +1,62 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// +// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#include "pipes.h" + +#define __GET_PIPE_NUM_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) uint \ +__get_pipe_num_packets_internal_##SIZE(__global struct pipeimp* p) \ +{ \ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + return (uint)(wi - ri); \ +} + +DO_PIPE_INTERNAL_SIZE(__GET_PIPE_NUM_PACKETS_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) uint +__get_pipe_num_packets_internal_user(__global struct pipeimp* p, size_t size) +{ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); + return (uint)(wi - ri); +} + +#define __GET_PIPE_MAX_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) uint \ +__get_pipe_max_packets_internal_##SIZE(__global struct pipeimp* p) \ +{ \ + return (uint)p->end_idx; \ +} + +DO_PIPE_INTERNAL_SIZE(__GET_PIPE_MAX_PACKETS_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) uint +__get_pipe_max_packets_internal_user(__global struct pipeimp* p, size_t size) +{ + return (uint)p->end_idx; +} +
diff --git a/amd-builtins/pipes/memcpyia.cl b/amd-builtins/pipes/memcpyia.cl new file mode 100644 index 0000000..9f57046 --- /dev/null +++ b/amd-builtins/pipes/memcpyia.cl
@@ -0,0 +1,70 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +__attribute__((always_inline, weak)) void +__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align) +{ + if (align == 2) { + short *d2 = (short *)d; + short *s2 = (short *)s; + short *e2 = s2 + size/2; + + while (s2 < e2) + *d2++ = *s2++; + } else if (align == 4) { + int *d4 = (int *)d; + int *s4 = (int *)s; + int *e4 = s4 + size/4; + + while (s4 < e4) + *d4++ = *s4++; + } else if (align == 8) { + long *d8 = (long *)d; + long *s8 = (long *)s; + long *e8 = s8 + size/8; + + while (s8 < e8) + *d8++ = *s8++; + } else if (align == 16) { + long2 *d16 = (long2 *)d; + long2 *s16 = (long2 *)s; + long2 *e16 = s16 + size/16; + + while (s16 < e16) + *d16++ = *s16++; + } else if (align == 32 || align == 64 || align == 128) { + long4 *d32 = (long4 *)d; + long4 *s32 = (long4 *)s; + long4 *e32 = s32 + size/32; + + while (s32 < e32) + *d32++ = *s32++; + } else { + char *d1 = (char *)d; + char *s1 = (char *)s; + char *e1 = s1 + size; + + while (s1 < e1) + *d1++ = *s1++; + } +} +
diff --git a/amd-builtins/pipes/pipes.h b/amd-builtins/pipes/pipes.h new file mode 100644 index 0000000..7a98fc1 --- /dev/null +++ b/amd-builtins/pipes/pipes.h
@@ -0,0 +1,69 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +// +// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#ifndef _PIPES_H +#define _PIPES_H 1 + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + +#define DO_PIPE_INTERNAL_SIZE(F) \ +F(1,uchar) \ +F(2,ushort) \ +F(4,uint) \ +F(8,ulong) \ +F(16,ulong2) \ +F(32,ulong4) \ +F(64,ulong8) \ +F(128,ulong16) + +struct pipeimp { + atomic_size_t read_idx; + atomic_size_t write_idx; + size_t end_idx; + uchar pad[128 - 3*sizeof(size_t)]; + uchar packets[1]; +}; + +extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t); + +static inline size_t +reserve(volatile __global atomic_size_t *pidx, size_t lim, size_t n) +{ + size_t idx = atomic_load_explicit(pidx, memory_order_acquire, memory_scope_device); + + for (;;) { + if (idx + n > lim) + return ~(size_t)0; + + if (atomic_compare_exchange_strong_explicit(pidx, &idx, idx + n, memory_order_acq_rel, memory_order_acquire, memory_scope_device)) + break; + } + + return idx; +} + +#endif // _PIPES_H +
diff --git a/amd-builtins/pipes/readp.cl b/amd-builtins/pipes/readp.cl new file mode 100644 index 0000000..7613d3f --- /dev/null +++ b/amd-builtins/pipes/readp.cl
@@ -0,0 +1,88 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +// +// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#include "pipes.h" + +#define __READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) int \ +__read_pipe_internal_##SIZE(__global struct pipeimp* p, STYPE* ptr) \ +{ \ + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + size_t ri = reserve(&p->read_idx, wi, 1); \ + if (ri == ~(size_t)0) \ + return -1; \ + \ + *ptr = ((__global STYPE *)p->packets)[ri % p->end_idx]; \ + \ + if (ri == wi-1) { \ + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \ + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + }\ +\ + return 0; \ +} + +DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) int +__read_pipe_internal_user( __global struct pipeimp* p, void* ptr, size_t size, size_t align) +{ + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); + size_t ri = reserve(&p->read_idx, wi, 1); + if (ri == ~(size_t)0) + return -1; + + __memcpy_internal_aligned(ptr, p->packets + (ri % p->end_idx)*size, size, align); + + if (ri == wi-1) { + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + + return 0; +} + +#define __READ_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) int \ +__read_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr) \ +{ \ + rid += i; \ + *ptr = ((__global STYPE *)p->packets)[rid % p->end_idx]; \ + \ + return 0; \ +} + +DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INDEXED_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) int +__read_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, void *ptr, size_t size, size_t align) +{ + rid += i; + + __memcpy_internal_aligned(ptr, p->packets + (rid % p->end_idx)*size, size, align); + + return 0; +} +
diff --git a/amd-builtins/pipes/reservep.cl b/amd-builtins/pipes/reservep.cl new file mode 100644 index 0000000..991041e --- /dev/null +++ b/amd-builtins/pipes/reservep.cl
@@ -0,0 +1,235 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +// +// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#include "pipes.h" +#include "../workgroup/wg.h" + +#define __RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) size_t \ +__reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + size_t rid = reserve(&p->read_idx, wi, num_packets); \ + \ + if (rid + num_packets == wi) { \ + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \ + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + } \ + \ + return rid; \ +} + +DO_PIPE_INTERNAL_SIZE(__RESERVE_READ_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) size_t +__reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size) +{ + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); + size_t rid = reserve(&p->read_idx, wi, num_packets); + + if (rid + num_packets == wi) { + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + + return rid; +} + +#define __RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) size_t \ +__reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + return reserve(&p->write_idx, ri + ei, num_packets); \ +} + +DO_PIPE_INTERNAL_SIZE(__RESERVE_WRITE_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) size_t +__reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size) +{ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + return 
reserve(&p->write_idx, ri + ei, num_packets); +} + +// Work group functions + +#define __WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) size_t \ +__work_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + __local size_t *t = (__local size_t *)__wg_scratch; \ + \ + if ((int)get_local_linear_id() == 0) { \ + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + size_t rid = reserve(&p->read_idx, wi, num_packets); \ + \ + if (rid + num_packets == wi) { \ + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \ + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + } \ + \ + *t = rid; \ + } \ + \ + work_group_barrier(CLK_LOCAL_MEM_FENCE); \ + \ + return *t; \ +} + +DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) size_t +__work_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size) +{ + __local size_t *t = (__local size_t *)__wg_scratch; + + if ((int)get_local_linear_id() == 0) { + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); + size_t rid = reserve(&p->read_idx, wi, num_packets); + + if (rid + num_packets == wi) { + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + + *t = rid; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + return *t; +} + +#define __WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) size_t \ +__work_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + __local size_t *t = (__local size_t *)__wg_scratch; \ + \ + if ((int)get_local_linear_id() == 0) { \ + size_t ri = 
atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + *t = reserve(&p->write_idx, ri + ei, num_packets); \ + } \ + \ + work_group_barrier(CLK_LOCAL_MEM_FENCE); \ + \ + return *t; \ +} + +DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) size_t +__work_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size) +{ + __local size_t *t = (__local size_t *)__wg_scratch; + + if ((int)get_local_linear_id() == 0) { + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + *t = reserve(&p->write_idx, ri + ei, num_packets); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + return *t; +} + +// sub group functions + +#define __SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) size_t \ +__sub_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t rid = ~(size_t)0; \ + \ + if (get_sub_group_local_id() == 0) { \ + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + rid = reserve(&p->read_idx, wi, num_packets); \ + \ + if (rid + num_packets == wi) { \ + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \ + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + } \ + } \ + \ + return sub_group_broadcast(rid, 0); \ +} + +DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) size_t +__sub_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size) +{ + size_t rid = ~(size_t)0; + + if (get_sub_group_local_id() == 0) { + size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); + rid = reserve(&p->read_idx, wi, num_packets); + + if (rid + 
num_packets == wi) { + atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); + atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + } + + return sub_group_broadcast(rid, 0); +} + +#define __SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) size_t \ +__sub_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t rid = ~(size_t)0; \ + \ + if (get_sub_group_local_id() == 0) { \ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + rid = reserve(&p->write_idx, ri + ei, num_packets); \ + } \ + \ + return sub_group_broadcast(rid, 0); \ +} + +DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) size_t +__sub_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size) +{ + size_t rid = ~(size_t)0; + + if (get_sub_group_local_id() == 0) { + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + rid = reserve(&p->write_idx, ri + ei, num_packets); + } + + return sub_group_broadcast(rid, 0); +} +
diff --git a/amd-builtins/pipes/validp.cl b/amd-builtins/pipes/validp.cl new file mode 100644 index 0000000..512b7d6 --- /dev/null +++ b/amd-builtins/pipes/validp.cl
@@ -0,0 +1,32 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +// +// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved. +// + + +__attribute__((always_inline, weak)) bool +__is_valid_reserve_id(size_t rid) +{ + return rid != ~(size_t)0; +} +
diff --git a/amd-builtins/pipes/writep.cl b/amd-builtins/pipes/writep.cl new file mode 100644 index 0000000..22cf6fb --- /dev/null +++ b/amd-builtins/pipes/writep.cl
@@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +// +// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#include "pipes.h" + +#define __WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) int \ +__write_pipe_internal_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \ +{ \ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + size_t wi = reserve(&p->write_idx, ri+ei, 1); \ + if (wi == ~(size_t)0) \ + return -1; \ + \ + ((__global STYPE *)p->packets)[wi % ei] = *ptr; \ + return 0; \ +} + +DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) int +__write_pipe_internal_user(__global struct pipeimp* p, const void* ptr, size_t size, size_t align) +{ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + size_t wi = reserve(&p->write_idx, ri+ei, 1); + if (wi == ~(size_t)0) + return -1; + + __memcpy_internal_aligned(p->packets + (wi % ei)*size, ptr, size, align); + + return 0; +} + +#define __WRITE_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \ +__attribute__((weak, always_inline)) int \ +__write_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr) \ +{ \ + rid += i; \ + ((__global STYPE *)p->packets)[rid % p->end_idx] = *ptr; \ + return 0; \ +} + +DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INDEXED_INTERNAL_SIZE) + +__attribute__((weak, always_inline)) int +__write_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, size_t size, size_t align) +{ + rid += i; + + __memcpy_internal_aligned(p->packets + (rid % p->end_idx)*size, ptr, size, align); + + return 0; +} +
diff --git a/amd-builtins/subgroup/subany.cl b/amd-builtins/subgroup/subany.cl new file mode 100644 index 0000000..5b76355 --- /dev/null +++ b/amd-builtins/subgroup/subany.cl
@@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +extern __attribute__((pure)) uint __hsail_activelanecount_wavewidth_u32_b1(bool); + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) int +sub_group_all(int predicate) +{ + return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) == __hsail_activelanecount_wavewidth_u32_b1(true); +} + +#ifdef __clang__ +__attribute__((overloadable)) +#endif +__attribute__((always_inline)) int +sub_group_any(int predicate) +{ + return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) != 0; +} +
diff --git a/amd-builtins/subgroup/subbar.cl b/amd-builtins/subgroup/subbar.cl new file mode 100644 index 0000000..9424af3 --- /dev/null +++ b/amd-builtins/subgroup/subbar.cl
@@ -0,0 +1,40 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +extern void __hsail_wavebarrier(void); + +__attribute__((overloadable,weak,always_inline)) void +sub_group_barrier(cl_mem_fence_flags flags) +{ + sub_group_barrier(flags, memory_scope_sub_group); +} + +__attribute__((overloadable,weak,always_inline)) void +sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope) +{ + // What about CLK_IMAGE_MEM_FENCE + atomic_work_item_fence(flags, memory_order_release, scope); + __hsail_wavebarrier(); + atomic_work_item_fence(flags, memory_order_acquire, scope); +} +
diff --git a/amd-builtins/subgroup/subbcast.cl b/amd-builtins/subgroup/subbcast.cl new file mode 100644 index 0000000..9adece6 --- /dev/null +++ b/amd-builtins/subgroup/subbcast.cl
@@ -0,0 +1,52 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival); +extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival); +extern void __hsail_wavebarrier(); + +__attribute__((always_inline)) static uint +bcast32(uint a, uint lid) +{ + a = __hsail_activelaneshuffle_wavewidth_b32(a, lid, 0U, false); + __hsail_wavebarrier(); + return a; +} + +extern __attribute__((overloadable, alias("bcast32"))) uint sub_group_broadcast(uint, uint); +extern __attribute__((overloadable, alias("bcast32"))) int sub_group_broadcast(int, uint); +extern __attribute__((overloadable, alias("bcast32"))) float sub_group_broadcast(float, uint); + + +__attribute__((always_inline)) static ulong +bcast64(ulong a, uint lid) +{ + a = __hsail_activelaneshuffle_wavewidth_b64(a, lid, 0UL, false); + __hsail_wavebarrier(); + return a; +} + +extern __attribute__((overloadable, alias("bcast64"))) ulong sub_group_broadcast(ulong, uint); +extern __attribute__((overloadable, alias("bcast64"))) long sub_group_broadcast(long, uint); +extern __attribute__((overloadable, alias("bcast64"))) double sub_group_broadcast(double, uint); +
diff --git a/amd-builtins/subgroup/subget.cl b/amd-builtins/subgroup/subget.cl new file mode 100644 index 0000000..ab74690 --- /dev/null +++ b/amd-builtins/subgroup/subget.cl
@@ -0,0 +1,84 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
 */
#pragma OPENCL EXTENSION cl_khr_subgroups : enable

// Flat (linearized) id of this work-item within its work-group.
extern __attribute__((pure)) uint __hsail_workitemid_flat(void);

// Sub-groups map to 64-lane hardware wavefronts throughout this file:
// 64U / 63U / 0x3f / >>6 below all encode that wave size.

#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) uint
get_sub_group_size(void)
{
    // Size of the current sub-group; the trailing sub-group of a
    // work-group whose size is not a multiple of 64 is partial.
    uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
    uint lid = (uint)get_local_linear_id();
    return min(64U, wgs - (lid & ~63U));
}

#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) uint
get_max_sub_group_size(void)
{
    // Uses the *enqueued* (uniform) work-group size, per the OpenCL 2.0
    // definition of the maximum sub-group size.
    // NOTE(review): only the first factor is cast to uint (product is
    // computed in size_t, then truncated) — differs from the all-cast
    // style used in get_sub_group_size; confirm intentional.
    uint wgs = (uint)get_enqueued_local_size(0) * get_enqueued_local_size(1) * get_enqueued_local_size(2);
    return min(64U, wgs);
}

#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) uint
get_num_sub_groups(void)
{
    // ceil(work-group size / 64)
    uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
    return (wgs + 63U) >> 6U;
}

#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) uint
get_enqueued_num_sub_groups(void)
{
    // ceil(enqueued work-group size / 64)
    uint wgs = (uint)get_enqueued_local_size(0) * get_enqueued_local_size(1) * get_enqueued_local_size(2);
    return (wgs + 63U) >> 6U;
}

#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) uint
get_sub_group_id(void)
{
    // Which wave within the work-group this lane belongs to.
    return __hsail_workitemid_flat() >> 6U;
}

#ifdef __clang__
__attribute__((overloadable))
#endif
__attribute__((always_inline)) uint
get_sub_group_local_id(void)
{
    // Lane index (0..63) within the wave.
    return __hsail_workitemid_flat() & 0x3fU;
}
diff --git a/amd-builtins/subgroup/subreduce.cl b/amd-builtins/subgroup/subreduce.cl new file mode 100644 index 0000000..d706c3d --- /dev/null +++ b/amd-builtins/subgroup/subreduce.cl
@@ -0,0 +1,94 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#if __OPENCL_C_VERSION__ >= 200 + +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +extern uint __hsail_get_lane_id(void); +extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival); +extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival); +extern void __hsail_wavebarrier(); + +#define GENA(TY,SZ,AO,AI,Z) \ +__attribute__((overloadable, always_inline)) TY \ +sub_group_reduce_add(TY a) \ +{ \ + uint lid = __hsail_get_lane_id(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(Z), false)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(Z), false)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(Z), false)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(Z), false)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(Z), false)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(Z), false)); \ + __hsail_wavebarrier(); \ + return a; \ +} + +GENA(int,32,as_int,as_uint,0) +GENA(uint,32,,,0U) +GENA(long,64,as_long,as_ulong,0L) +GENA(ulong,64,,,0UL) +GENA(float,32,as_float,as_uint,0.0f) +GENA(double,64,as_double,as_ulong,0.0) + +#define GENO(TY,SZ,OP,AO,AI,ID) \ +__attribute__((overloadable, always_inline)) TY \ +sub_group_reduce_##OP(TY a) \ +{ \ + uint lid = __hsail_get_lane_id(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(ID), false))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(ID), false))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(ID), false))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(ID), 
false))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(ID), false))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(ID), false))); \ + __hsail_wavebarrier(); \ + return a; \ +} + +GENO(int,32,min,as_int,as_uint,INT_MAX) +GENO(uint,32,min,,,UINT_MAX) +GENO(long,64,min,as_long,as_ulong,LONG_MAX) +GENO(ulong,64,min,,,ULONG_MAX) +GENO(float,32,min,as_float,as_uint,INFINITY) +GENO(double,64,min,as_double,as_ulong,(double)INFINITY) + +GENO(int,32,max,as_int,as_uint,INT_MIN) +GENO(uint,32,max,,,0U) +GENO(long,64,max,as_long,as_ulong,LONG_MIN) +GENO(ulong,64,max,,,0UL) +GENO(float,32,max,as_float,as_uint,-INFINITY) +GENO(double,64,max,as_double,as_ulong,-(double)INFINITY) + +#endif +
diff --git a/amd-builtins/subgroup/subscan.cl b/amd-builtins/subgroup/subscan.cl new file mode 100644 index 0000000..f0cddb1 --- /dev/null +++ b/amd-builtins/subgroup/subscan.cl
@@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +extern uint __hsail_get_lane_id(void); +extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival); +extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival); +extern void __hsail_wavebarrier(); + +// Define exclusive in terms of inclusive + +#define EGEN(TY,OP,SZ,AO,AI,ID) \ +__attribute__((overloadable, always_inline)) TY \ +sub_group_scan_exclusive_##OP(TY a) \ +{ \ + a = sub_group_scan_inclusive_##OP(a); \ + uint lid = __hsail_get_lane_id(); \ + a = AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \ + return a; \ +} + +EGEN(int,add,32,as_int,as_uint,0) +EGEN(int,min,32,as_int,as_uint,INT_MAX) +EGEN(int,max,32,as_int,as_uint,INT_MIN) + +EGEN(uint,add,32,,,0) +EGEN(uint,min,32,,,UINT_MAX) +EGEN(uint,max,32,,,0U) + +EGEN(long,add,64,as_long,as_ulong,0L) +EGEN(long,min,64,as_long,as_ulong,LONG_MAX) +EGEN(long,max,64,as_long,as_ulong,LONG_MIN) + +EGEN(ulong,add,64,,,0UL) +EGEN(ulong,min,64,,,ULONG_MAX) +EGEN(ulong,max,64,,,0UL) + +EGEN(float,add,32,as_float,as_uint,0.0f) +EGEN(float,min,32,as_float,as_uint,INFINITY) +EGEN(float,max,32,as_float,as_uint,-INFINITY) + +EGEN(double,add,64,as_double,as_ulong,0.0) +EGEN(double,min,64,as_double,as_ulong,(double)INFINITY) +EGEN(double,max,64,as_double,as_ulong,-(double)INFINITY) + +// Now inclusive scan + +#define IGENA(TY,SZ,AO,AI,ID) \ +__attribute__((overloadable, always_inline)) TY \ +sub_group_scan_inclusive_add(TY a) \ +{ \ + uint lid = __hsail_get_lane_id(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4)); \ + __hsail_wavebarrier(); \ + a += 
AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16)); \ + __hsail_wavebarrier(); \ + a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32)); \ + __hsail_wavebarrier(); \ + return a; \ +} + +#define IGENO(TY,SZ,OP,AO,AI,ID) \ +__attribute__((overloadable, always_inline)) TY \ +sub_group_scan_inclusive_##OP(TY a) \ +{ \ + uint lid = __hsail_get_lane_id(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16))); \ + __hsail_wavebarrier(); \ + a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32))); \ + __hsail_wavebarrier(); \ + return a; \ +} + +IGENA(int,32,as_int,as_uint,0) +IGENO(int,32,min,as_int,as_uint,INT_MAX) +IGENO(int,32,max,as_int,as_uint,INT_MIN) + +IGENA(uint,32,,,0U) +IGENO(uint,32,min,,,UINT_MAX) +IGENO(uint,32,max,,,0U) + +IGENA(long,64,as_long,as_ulong,0L) +IGENO(long,64,min,as_long,as_ulong,LONG_MAX) +IGENO(long,64,max,as_long,as_ulong,LONG_MIN) + +IGENA(ulong,64,,,0UL) +IGENO(ulong,64,min,,,ULONG_MAX) +IGENO(ulong,64,max,,,0UL) + +IGENA(float,32,as_float,as_uint,0.0f) +IGENO(float,32,min,as_float,as_uint,INFINITY) +IGENO(float,32,max,as_float,as_uint,-INFINITY) + +IGENA(double,64,as_double,as_ulong,0.0) +IGENO(double,64,min,as_double,as_ulong,(double)INFINITY) 
+IGENO(double,64,max,as_double,as_ulong,-(double)INFINITY) +
diff --git a/amd-builtins/vldst/f16_f32.cl b/amd-builtins/vldst/f16_f32.cl new file mode 100644 index 0000000..d4fddd5 --- /dev/null +++ b/amd-builtins/vldst/f16_f32.cl
@@ -0,0 +1,330 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +extern float __amdil_half_to_float_f32(uint op1); + +extern float __amdil_float_to_half_f32(float op1); +extern float __amdil_float_to_half_near_f32(float op1); +extern float __amdil_float_to_half_neg_inf_f32(float op1); +extern float __amdil_float_to_half_plus_inf_f32(float op1); + +// half -> float +__attribute__((always_inline)) float +__cvt_f16_to_f32(ushort a) +{ + return __amdil_half_to_float_f32((uint)a); +} + +__attribute__((always_inline)) float2 +__cvt_2f16_to_2f32(ushort2 ush) +{ + float2 ret; + ret.s0 = __cvt_f16_to_f32(ush.s0); + ret.s1 = __cvt_f16_to_f32(ush.s1); + return ret; +} + +__attribute__((always_inline)) float3 +__cvt_3f16_to_3f32(ushort3 ush) +{ + float3 ret; + ret.lo = __cvt_2f16_to_2f32(ush.lo); + ret.s2 = __cvt_f16_to_f32(ush.s2); + return ret; +} + +__attribute__((always_inline)) float4 +__cvt_4f16_to_4f32(ushort4 ush) +{ + float4 ret; + ret.lo = __cvt_2f16_to_2f32(ush.lo); + ret.hi = __cvt_2f16_to_2f32(ush.hi); + return ret; +} + +__attribute__((always_inline)) float8 +__cvt_8f16_to_8f32(ushort8 ush) +{ + float8 ret; + ret.lo = __cvt_4f16_to_4f32(ush.lo); + ret.hi = __cvt_4f16_to_4f32(ush.hi); + return ret; +} + +__attribute__((always_inline)) float16 +__cvt_16f16_to_16f32(ushort16 ush) +{ + float16 ret; + ret.lo = __cvt_8f16_to_8f32(ush.lo); + ret.hi = __cvt_8f16_to_8f32(ush.hi); + return ret; +} + +// float -> half rte +__attribute__((always_inline)) ushort +__cvt_f32_to_f16_rte(float a) +{ + return (ushort)as_uint(__amdil_float_to_half_near_f32(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f32_to_2f16_rte(float2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f32_to_f16_rte(f.s0); + ret.s1 = __cvt_f32_to_f16_rte(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f32_to_3f16_rte(float3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f32_to_2f16_rte(f.lo); + ret.s2 = __cvt_f32_to_f16_rte(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 +__cvt_4f32_to_4f16_rte(float4 f) +{ + ushort4 ret; + ret.lo 
= __cvt_2f32_to_2f16_rte(f.lo); + ret.hi = __cvt_2f32_to_2f16_rte(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort8 +__cvt_8f32_to_8f16_rte(float8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f32_to_4f16_rte(f.lo); + ret.hi = __cvt_4f32_to_4f16_rte(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f32_to_16f16_rte(float16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f32_to_8f16_rte(f.lo); + ret.hi = __cvt_8f32_to_8f16_rte(f.hi); + return ret; +} + +// float -> half cur +// XXX assumes RTE +__attribute__((always_inline)) ushort +__cvt_f32_to_f16_cur(float f) +{ + return __cvt_f32_to_f16_rte(f); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f32_to_2f16_cur(float2 f) +{ + return __cvt_2f32_to_2f16_rte(f); +} + +__attribute__((always_inline)) ushort3 +__cvt_3f32_to_3f16_cur(float3 f) +{ + return __cvt_3f32_to_3f16_rte(f); +} + +__attribute__((always_inline)) ushort4 +__cvt_4f32_to_4f16_cur(float4 f) +{ + return __cvt_4f32_to_4f16_rte(f); +} + +__attribute__((always_inline)) ushort8 +__cvt_8f32_to_8f16_cur(float8 f) +{ + return __cvt_8f32_to_8f16_rte(f); +} + +__attribute__((always_inline)) ushort16 +__cvt_16f32_to_16f16_cur(float16 f) +{ + return __cvt_16f32_to_16f16_rte(f); +} + +//float -> half rtp + +ushort +__cvt_f32_to_f16_rtp(float a) +{ + return (ushort)as_uint(__amdil_float_to_half_plus_inf_f32(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f32_to_2f16_rtp(float2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f32_to_f16_rtp(f.s0); + ret.s1 = __cvt_f32_to_f16_rtp(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f32_to_3f16_rtp(float3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f32_to_2f16_rtp(f.lo); + ret.s2 = __cvt_f32_to_f16_rtp(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 +__cvt_4f32_to_4f16_rtp(float4 f) +{ + ushort4 ret; + ret.lo = __cvt_2f32_to_2f16_rtp(f.lo); + ret.hi = __cvt_2f32_to_2f16_rtp(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort8 
+__cvt_8f32_to_8f16_rtp(float8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f32_to_4f16_rtp(f.lo); + ret.hi = __cvt_4f32_to_4f16_rtp(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f32_to_16f16_rtp(float16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f32_to_8f16_rtp(f.lo); + ret.hi = __cvt_8f32_to_8f16_rtp(f.hi); + return ret; +} + +// float -> half rtn + +ushort +__cvt_f32_to_f16_rtn(float a) +{ + return (ushort)as_uint(__amdil_float_to_half_neg_inf_f32(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f32_to_2f16_rtn(float2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f32_to_f16_rtn(f.s0); + ret.s1 = __cvt_f32_to_f16_rtn(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f32_to_3f16_rtn(float3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f32_to_2f16_rtn(f.lo); + ret.s2 = __cvt_f32_to_f16_rtn(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 +__cvt_4f32_to_4f16_rtn(float4 f) +{ + ushort4 ret; + ret.lo = __cvt_2f32_to_2f16_rtn(f.lo); + ret.hi = __cvt_2f32_to_2f16_rtn(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort8 +__cvt_8f32_to_8f16_rtn(float8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f32_to_4f16_rtn(f.lo); + ret.hi = __cvt_4f32_to_4f16_rtn(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f32_to_16f16_rtn(float16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f32_to_8f16_rtn(f.lo); + ret.hi = __cvt_8f32_to_8f16_rtn(f.hi); + return ret; +} + +// float -> half rtz + +ushort +__cvt_f32_to_f16_rtz(float a) +{ + return (ushort)as_uint(__amdil_float_to_half_f32(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f32_to_2f16_rtz(float2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f32_to_f16_rtz(f.s0); + ret.s1 = __cvt_f32_to_f16_rtz(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f32_to_3f16_rtz(float3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f32_to_2f16_rtz(f.lo); + ret.s2 = __cvt_f32_to_f16_rtz(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 
+__cvt_4f32_to_4f16_rtz(float4 f) +{ + ushort4 ret; + ret.lo = __cvt_2f32_to_2f16_rtz(f.lo); + ret.hi = __cvt_2f32_to_2f16_rtz(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort8 +__cvt_8f32_to_8f16_rtz(float8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f32_to_4f16_rtz(f.lo); + ret.hi = __cvt_4f32_to_4f16_rtz(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f32_to_16f16_rtz(float16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f32_to_8f16_rtz(f.lo); + ret.hi = __cvt_8f32_to_8f16_rtz(f.hi); + return ret; +} +
diff --git a/amd-builtins/vldst/f64_f16.cl b/amd-builtins/vldst/f64_f16.cl new file mode 100644 index 0000000..d603d8d --- /dev/null +++ b/amd-builtins/vldst/f64_f16.cl
@@ -0,0 +1,276 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +extern float __amdil_double_to_half_f64(double op1); +extern float __amdil_double_to_half_near_f64(double op1); +extern float __amdil_double_to_half_neg_inf_f64(double op1); +extern float __amdil_double_to_half_plus_inf_f64(double op1); + +// double -> half rte +__attribute__((always_inline)) ushort +__cvt_f64_to_f16_rte(double a) +{ + return (ushort)as_uint(__amdil_double_to_half_near_f64(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f64_to_2f16_rte(double2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f64_to_f16_rte(f.s0); + ret.s1 = __cvt_f64_to_f16_rte(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f64_to_3f16_rte(double3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f64_to_2f16_rte(f.lo); + ret.s2 = __cvt_f64_to_f16_rte(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 +__cvt_4f64_to_4f16_rte(double4 f) +{ + ushort4 ret; + ret.lo = __cvt_2f64_to_2f16_rte(f.lo); + ret.hi = __cvt_2f64_to_2f16_rte(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort8 +__cvt_8f64_to_8f16_rte(double8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f64_to_4f16_rte(f.lo); + ret.hi = __cvt_4f64_to_4f16_rte(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f64_to_16f16_rte(double16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f64_to_8f16_rte(f.lo); + ret.hi = __cvt_8f64_to_8f16_rte(f.hi); + return ret; +} + +// double -> half cur +// XXX assumes RTE +__attribute__((always_inline)) ushort +__cvt_f64_to_f16_cur(double f) +{ + return __cvt_f64_to_f16_rte(f); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f64_to_2f16_cur(double2 f) +{ + return __cvt_2f64_to_2f16_rte(f); +} + +__attribute__((always_inline)) ushort3 +__cvt_3f64_to_3f16_cur(double3 f) +{ + return __cvt_3f64_to_3f16_rte(f); +} + +__attribute__((always_inline)) ushort4 +__cvt_4f64_to_4f16_cur(double4 f) +{ + return __cvt_4f64_to_4f16_rte(f); +} + +__attribute__((always_inline)) ushort8 +__cvt_8f64_to_8f16_cur(double8 f) +{ + return 
__cvt_8f64_to_8f16_rte(f); +} + +__attribute__((always_inline)) ushort16 +__cvt_16f64_to_16f16_cur(double16 f) +{ + return __cvt_16f64_to_16f16_rte(f); +} + +//double -> half rtp + +ushort +__cvt_f64_to_f16_rtp(double a) +{ + return (ushort)as_uint(__amdil_double_to_half_plus_inf_f64(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f64_to_2f16_rtp(double2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f64_to_f16_rtp(f.s0); + ret.s1 = __cvt_f64_to_f16_rtp(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f64_to_3f16_rtp(double3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f64_to_2f16_rtp(f.lo); + ret.s2 = __cvt_f64_to_f16_rtp(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 +__cvt_4f64_to_4f16_rtp(double4 f) +{ + ushort4 ret; + ret.lo = __cvt_2f64_to_2f16_rtp(f.lo); + ret.hi = __cvt_2f64_to_2f16_rtp(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort8 +__cvt_8f64_to_8f16_rtp(double8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f64_to_4f16_rtp(f.lo); + ret.hi = __cvt_4f64_to_4f16_rtp(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f64_to_16f16_rtp(double16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f64_to_8f16_rtp(f.lo); + ret.hi = __cvt_8f64_to_8f16_rtp(f.hi); + return ret; +} + +// double -> half rtn + +ushort +__cvt_f64_to_f16_rtn(double a) +{ + return (ushort)as_uint(__amdil_double_to_half_neg_inf_f64(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f64_to_2f16_rtn(double2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f64_to_f16_rtn(f.s0); + ret.s1 = __cvt_f64_to_f16_rtn(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f64_to_3f16_rtn(double3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f64_to_2f16_rtn(f.lo); + ret.s2 = __cvt_f64_to_f16_rtn(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 +__cvt_4f64_to_4f16_rtn(double4 f) +{ + ushort4 ret; + ret.lo = __cvt_2f64_to_2f16_rtn(f.lo); + ret.hi = __cvt_2f64_to_2f16_rtn(f.hi); + return ret; +} + 
+__attribute__((always_inline)) ushort8 +__cvt_8f64_to_8f16_rtn(double8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f64_to_4f16_rtn(f.lo); + ret.hi = __cvt_4f64_to_4f16_rtn(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f64_to_16f16_rtn(double16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f64_to_8f16_rtn(f.lo); + ret.hi = __cvt_8f64_to_8f16_rtn(f.hi); + return ret; +} + +// double -> half rtz + +ushort +__cvt_f64_to_f16_rtz(double a) +{ + return (ushort)as_uint(__amdil_double_to_half_f64(a)); +} + +__attribute__((always_inline)) ushort2 +__cvt_2f64_to_2f16_rtz(double2 f) +{ + ushort2 ret; + ret.s0 = __cvt_f64_to_f16_rtz(f.s0); + ret.s1 = __cvt_f64_to_f16_rtz(f.s1); + return ret; +} + +__attribute__((always_inline)) ushort3 +__cvt_3f64_to_3f16_rtz(double3 f) +{ + ushort3 ret; + ret.lo = __cvt_2f64_to_2f16_rtz(f.lo); + ret.s2 = __cvt_f64_to_f16_rtz(f.s2); + return ret; +} + +__attribute__((always_inline)) ushort4 +__cvt_4f64_to_4f16_rtz(double4 f) +{ + ushort4 ret; + ret.lo = __cvt_2f64_to_2f16_rtz(f.lo); + ret.hi = __cvt_2f64_to_2f16_rtz(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort8 +__cvt_8f64_to_8f16_rtz(double8 f) +{ + ushort8 ret; + ret.lo = __cvt_4f64_to_4f16_rtz(f.lo); + ret.hi = __cvt_4f64_to_4f16_rtz(f.hi); + return ret; +} + +__attribute__((always_inline)) ushort16 +__cvt_16f64_to_16f16_rtz(double16 f) +{ + ushort16 ret; + ret.lo = __cvt_8f64_to_8f16_rtz(f.lo); + ret.hi = __cvt_8f64_to_8f16_rtz(f.hi); + return ret; +} +
diff --git a/amd-builtins/vldst/vldst_gen.cl b/amd-builtins/vldst/vldst_gen.cl new file mode 100644 index 0000000..7d1f4ae --- /dev/null +++ b/amd-builtins/vldst/vldst_gen.cl
@@ -0,0 +1,3206 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + + +__attribute__((overloadable, always_inline, weak)) float2 +vload2(size_t i, const float *p) +{ + return as_float2(vload2(i, (const int *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) float2 +vload2(size_t i, const __constant float *p) +{ + return as_float2(vload2(i, (const __constant int *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float2 +vload2(size_t i, const __global float *p) +{ + return as_float2(vload2(i, (const __global int *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float2 +vload2(size_t i, const __local float *p) +{ + return as_float2(vload2(i, (const __local int *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) double2 +vload2(size_t i, const double *p) +{ + return as_double2(vload2(i, (const long *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) double2 +vload2(size_t i, const __constant double *p) +{ + return as_double2(vload2(i, (const __constant long *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double2 +vload2(size_t i, const __global double *p) +{ + return as_double2(vload2(i, (const __global long *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double2 +vload2(size_t i, const __local double *p) +{ + return as_double2(vload2(i, (const __local long *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) float3 +vload3(size_t i, const float *p) +{ + return as_float3(vload3(i, (const int *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) float3 +vload3(size_t i, const __constant float *p) +{ + return as_float3(vload3(i, (const __constant int *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float3 +vload3(size_t i, const __global float *p) +{ + return as_float3(vload3(i, (const 
__global int *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float3 +vload3(size_t i, const __local float *p) +{ + return as_float3(vload3(i, (const __local int *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) double3 +vload3(size_t i, const double *p) +{ + return as_double3(vload3(i, (const long *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) double3 +vload3(size_t i, const __constant double *p) +{ + return as_double3(vload3(i, (const __constant long *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double3 +vload3(size_t i, const __global double *p) +{ + return as_double3(vload3(i, (const __global long *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double3 +vload3(size_t i, const __local double *p) +{ + return as_double3(vload3(i, (const __local long *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) float4 +vload4(size_t i, const float *p) +{ + return as_float4(vload4(i, (const int *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) float4 +vload4(size_t i, const __constant float *p) +{ + return as_float4(vload4(i, (const __constant int *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float4 +vload4(size_t i, const __global float *p) +{ + return as_float4(vload4(i, (const __global int *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float4 +vload4(size_t i, const __local float *p) +{ + return as_float4(vload4(i, (const __local int *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) double4 +vload4(size_t i, const double *p) +{ + return as_double4(vload4(i, (const long *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) double4 +vload4(size_t i, const __constant double *p) +{ + 
return as_double4(vload4(i, (const __constant long *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double4 +vload4(size_t i, const __global double *p) +{ + return as_double4(vload4(i, (const __global long *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double4 +vload4(size_t i, const __local double *p) +{ + return as_double4(vload4(i, (const __local long *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) float8 +vload8(size_t i, const float *p) +{ + return as_float8(vload8(i, (const int *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) float8 +vload8(size_t i, const __constant float *p) +{ + return as_float8(vload8(i, (const __constant int *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float8 +vload8(size_t i, const __global float *p) +{ + return as_float8(vload8(i, (const __global int *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float8 +vload8(size_t i, const __local float *p) +{ + return as_float8(vload8(i, (const __local int *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) double8 +vload8(size_t i, const double *p) +{ + return as_double8(vload8(i, (const long *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) double8 +vload8(size_t i, const __constant double *p) +{ + return as_double8(vload8(i, (const __constant long *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double8 +vload8(size_t i, const __global double *p) +{ + return as_double8(vload8(i, (const __global long *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double8 +vload8(size_t i, const __local double *p) +{ + return as_double8(vload8(i, (const __local long *)p)); +} +#endif + + 
+__attribute__((overloadable, always_inline, weak)) float16 +vload16(size_t i, const float *p) +{ + return as_float16(vload16(i, (const int *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) float16 +vload16(size_t i, const __constant float *p) +{ + return as_float16(vload16(i, (const __constant int *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float16 +vload16(size_t i, const __global float *p) +{ + return as_float16(vload16(i, (const __global int *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) float16 +vload16(size_t i, const __local float *p) +{ + return as_float16(vload16(i, (const __local int *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) double16 +vload16(size_t i, const double *p) +{ + return as_double16(vload16(i, (const long *)p)); +} + + + +__attribute__((overloadable, always_inline, weak)) double16 +vload16(size_t i, const __constant double *p) +{ + return as_double16(vload16(i, (const __constant long *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double16 +vload16(size_t i, const __global double *p) +{ + return as_double16(vload16(i, (const __global long *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) double16 +vload16(size_t i, const __local double *p) +{ + return as_double16(vload16(i, (const __local long *)p)); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore2(float2 v, size_t i, float *p) +{ + vstore2(as_int2(v), i, (int *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore2(float2 v, size_t i, __global float *p) +{ + vstore2(as_int2(v), i, (__global int *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore2(float2 v, size_t i, __local float *p) +{ 
+ vstore2(as_int2(v), i, (__local int *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore2(double2 v, size_t i, double *p) +{ + vstore2(as_long2(v), i, (long *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore2(double2 v, size_t i, __global double *p) +{ + vstore2(as_long2(v), i, (__global long *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore2(double2 v, size_t i, __local double *p) +{ + vstore2(as_long2(v), i, (__local long *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore3(float3 v, size_t i, float *p) +{ + vstore3(as_int3(v), i, (int *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore3(float3 v, size_t i, __global float *p) +{ + vstore3(as_int3(v), i, (__global int *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore3(float3 v, size_t i, __local float *p) +{ + vstore3(as_int3(v), i, (__local int *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore3(double3 v, size_t i, double *p) +{ + vstore3(as_long3(v), i, (long *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore3(double3 v, size_t i, __global double *p) +{ + vstore3(as_long3(v), i, (__global long *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore3(double3 v, size_t i, __local double *p) +{ + vstore3(as_long3(v), i, (__local long *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore4(float4 v, size_t i, float *p) +{ + vstore4(as_int4(v), i, (int *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore4(float4 v, size_t i, __global float *p) +{ + 
vstore4(as_int4(v), i, (__global int *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore4(float4 v, size_t i, __local float *p) +{ + vstore4(as_int4(v), i, (__local int *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore4(double4 v, size_t i, double *p) +{ + vstore4(as_long4(v), i, (long *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore4(double4 v, size_t i, __global double *p) +{ + vstore4(as_long4(v), i, (__global long *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore4(double4 v, size_t i, __local double *p) +{ + vstore4(as_long4(v), i, (__local long *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore8(float8 v, size_t i, float *p) +{ + vstore8(as_int8(v), i, (int *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore8(float8 v, size_t i, __global float *p) +{ + vstore8(as_int8(v), i, (__global int *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore8(float8 v, size_t i, __local float *p) +{ + vstore8(as_int8(v), i, (__local int *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore8(double8 v, size_t i, double *p) +{ + vstore8(as_long8(v), i, (long *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore8(double8 v, size_t i, __global double *p) +{ + vstore8(as_long8(v), i, (__global long *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore8(double8 v, size_t i, __local double *p) +{ + vstore8(as_long8(v), i, (__local long *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore16(float16 v, size_t i, float *p) +{ + 
vstore16(as_int16(v), i, (int *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore16(float16 v, size_t i, __global float *p) +{ + vstore16(as_int16(v), i, (__global int *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore16(float16 v, size_t i, __local float *p) +{ + vstore16(as_int16(v), i, (__local int *)p); +} +#endif + + +__attribute__((overloadable, always_inline, weak)) void +vstore16(double16 v, size_t i, double *p) +{ + vstore16(as_long16(v), i, (long *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore16(double16 v, size_t i, __global double *p) +{ + vstore16(as_long16(v), i, (__global long *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((overloadable, always_inline, weak)) void +vstore16(double16 v, size_t i, __local double *p) +{ + vstore16(as_long16(v), i, (__local long *)p); +} +#endif + + +__attribute__((always_inline)) static char2 +vldp12(size_t i, const char *p) +{ + char2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp12"))) char2 vload2(size_t, const char *); +extern __attribute__((overloadable, weak, alias("vldp12"))) uchar2 vload2(size_t, const uchar *); + + + +__attribute__((always_inline)) static char2 +vldc12(size_t i, const __constant char *p) +{ + char2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc12"))) char2 vload2(size_t, const __constant char *); +extern __attribute__((overloadable, weak, alias("vldc12"))) uchar2 vload2(size_t, const __constant uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char2 +vldg12(size_t i, const __global char *p) +{ + char2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern 
__attribute__((overloadable, weak, alias("vldg12"))) char2 vload2(size_t, const __global char *); +extern __attribute__((overloadable, weak, alias("vldg12"))) uchar2 vload2(size_t, const __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char2 +vldl12(size_t i, const __local char *p) +{ + char2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl12"))) char2 vload2(size_t, const __local char *); +extern __attribute__((overloadable, weak, alias("vldl12"))) uchar2 vload2(size_t, const __local uchar *); +#endif + + +__attribute__((always_inline)) static short2 +vldp22(size_t i, const short *p) +{ + short2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp22"))) short2 vload2(size_t, const short *); +extern __attribute__((overloadable, weak, alias("vldp22"))) ushort2 vload2(size_t, const ushort *); + + + +__attribute__((always_inline)) static short2 +vldc22(size_t i, const __constant short *p) +{ + short2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc22"))) short2 vload2(size_t, const __constant short *); +extern __attribute__((overloadable, weak, alias("vldc22"))) ushort2 vload2(size_t, const __constant ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short2 +vldg22(size_t i, const __global short *p) +{ + short2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg22"))) short2 vload2(size_t, const __global short *); +extern __attribute__((overloadable, weak, alias("vldg22"))) ushort2 vload2(size_t, const __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short2 +vldl22(size_t i, const __local short *p) +{ + short2 ret; + p += i * 2; + 
ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl22"))) short2 vload2(size_t, const __local short *); +extern __attribute__((overloadable, weak, alias("vldl22"))) ushort2 vload2(size_t, const __local ushort *); +#endif + + +__attribute__((always_inline)) static int2 +vldp42(size_t i, const int *p) +{ + int2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp42"))) int2 vload2(size_t, const int *); +extern __attribute__((overloadable, weak, alias("vldp42"))) uint2 vload2(size_t, const uint *); + + + +__attribute__((always_inline)) static int2 +vldc42(size_t i, const __constant int *p) +{ + int2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc42"))) int2 vload2(size_t, const __constant int *); +extern __attribute__((overloadable, weak, alias("vldc42"))) uint2 vload2(size_t, const __constant uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int2 +vldg42(size_t i, const __global int *p) +{ + int2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg42"))) int2 vload2(size_t, const __global int *); +extern __attribute__((overloadable, weak, alias("vldg42"))) uint2 vload2(size_t, const __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int2 +vldl42(size_t i, const __local int *p) +{ + int2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl42"))) int2 vload2(size_t, const __local int *); +extern __attribute__((overloadable, weak, alias("vldl42"))) uint2 vload2(size_t, const __local uint *); +#endif + + +__attribute__((always_inline)) static long2 +vldp82(size_t i, const long *p) +{ + long2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = 
p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp82"))) long2 vload2(size_t, const long *); +extern __attribute__((overloadable, weak, alias("vldp82"))) ulong2 vload2(size_t, const ulong *); + + + +__attribute__((always_inline)) static long2 +vldc82(size_t i, const __constant long *p) +{ + long2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc82"))) long2 vload2(size_t, const __constant long *); +extern __attribute__((overloadable, weak, alias("vldc82"))) ulong2 vload2(size_t, const __constant ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long2 +vldg82(size_t i, const __global long *p) +{ + long2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg82"))) long2 vload2(size_t, const __global long *); +extern __attribute__((overloadable, weak, alias("vldg82"))) ulong2 vload2(size_t, const __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long2 +vldl82(size_t i, const __local long *p) +{ + long2 ret; + p += i * 2; + ret.s0 = p[0]; + ret.s1 = p[1]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl82"))) long2 vload2(size_t, const __local long *); +extern __attribute__((overloadable, weak, alias("vldl82"))) ulong2 vload2(size_t, const __local ulong *); +#endif + + +__attribute__((always_inline)) static char3 +vldp13(size_t i, const char *p) +{ + char3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp13"))) char3 vload3(size_t, const char *); +extern __attribute__((overloadable, weak, alias("vldp13"))) uchar3 vload3(size_t, const uchar *); + + + +__attribute__((always_inline)) static char3 +vldc13(size_t i, const __constant char *p) +{ + char3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = 
p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc13"))) char3 vload3(size_t, const __constant char *); +extern __attribute__((overloadable, weak, alias("vldc13"))) uchar3 vload3(size_t, const __constant uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char3 +vldg13(size_t i, const __global char *p) +{ + char3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg13"))) char3 vload3(size_t, const __global char *); +extern __attribute__((overloadable, weak, alias("vldg13"))) uchar3 vload3(size_t, const __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char3 +vldl13(size_t i, const __local char *p) +{ + char3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl13"))) char3 vload3(size_t, const __local char *); +extern __attribute__((overloadable, weak, alias("vldl13"))) uchar3 vload3(size_t, const __local uchar *); +#endif + + +__attribute__((always_inline)) static short3 +vldp23(size_t i, const short *p) +{ + short3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp23"))) short3 vload3(size_t, const short *); +extern __attribute__((overloadable, weak, alias("vldp23"))) ushort3 vload3(size_t, const ushort *); + + + +__attribute__((always_inline)) static short3 +vldc23(size_t i, const __constant short *p) +{ + short3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc23"))) short3 vload3(size_t, const __constant short *); +extern __attribute__((overloadable, weak, alias("vldc23"))) ushort3 vload3(size_t, const __constant ushort *); + + +#if __OPENCL_C_VERSION__ < 200 
+__attribute__((always_inline)) static short3 +vldg23(size_t i, const __global short *p) +{ + short3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg23"))) short3 vload3(size_t, const __global short *); +extern __attribute__((overloadable, weak, alias("vldg23"))) ushort3 vload3(size_t, const __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short3 +vldl23(size_t i, const __local short *p) +{ + short3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl23"))) short3 vload3(size_t, const __local short *); +extern __attribute__((overloadable, weak, alias("vldl23"))) ushort3 vload3(size_t, const __local ushort *); +#endif + + +__attribute__((always_inline)) static int3 +vldp43(size_t i, const int *p) +{ + int3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp43"))) int3 vload3(size_t, const int *); +extern __attribute__((overloadable, weak, alias("vldp43"))) uint3 vload3(size_t, const uint *); + + + +__attribute__((always_inline)) static int3 +vldc43(size_t i, const __constant int *p) +{ + int3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc43"))) int3 vload3(size_t, const __constant int *); +extern __attribute__((overloadable, weak, alias("vldc43"))) uint3 vload3(size_t, const __constant uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int3 +vldg43(size_t i, const __global int *p) +{ + int3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg43"))) int3 vload3(size_t, const __global int *); +extern 
__attribute__((overloadable, weak, alias("vldg43"))) uint3 vload3(size_t, const __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int3 +vldl43(size_t i, const __local int *p) +{ + int3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl43"))) int3 vload3(size_t, const __local int *); +extern __attribute__((overloadable, weak, alias("vldl43"))) uint3 vload3(size_t, const __local uint *); +#endif + + +__attribute__((always_inline)) static long3 +vldp83(size_t i, const long *p) +{ + long3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp83"))) long3 vload3(size_t, const long *); +extern __attribute__((overloadable, weak, alias("vldp83"))) ulong3 vload3(size_t, const ulong *); + + + +__attribute__((always_inline)) static long3 +vldc83(size_t i, const __constant long *p) +{ + long3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc83"))) long3 vload3(size_t, const __constant long *); +extern __attribute__((overloadable, weak, alias("vldc83"))) ulong3 vload3(size_t, const __constant ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long3 +vldg83(size_t i, const __global long *p) +{ + long3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg83"))) long3 vload3(size_t, const __global long *); +extern __attribute__((overloadable, weak, alias("vldg83"))) ulong3 vload3(size_t, const __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long3 +vldl83(size_t i, const __local long *p) +{ + long3 ret; + p += i * 3; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + + return ret; +} 
+extern __attribute__((overloadable, weak, alias("vldl83"))) long3 vload3(size_t, const __local long *); +extern __attribute__((overloadable, weak, alias("vldl83"))) ulong3 vload3(size_t, const __local ulong *); +#endif + + +__attribute__((always_inline)) static char4 +vldp14(size_t i, const char *p) +{ + char4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp14"))) char4 vload4(size_t, const char *); +extern __attribute__((overloadable, weak, alias("vldp14"))) uchar4 vload4(size_t, const uchar *); + + + +__attribute__((always_inline)) static char4 +vldc14(size_t i, const __constant char *p) +{ + char4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc14"))) char4 vload4(size_t, const __constant char *); +extern __attribute__((overloadable, weak, alias("vldc14"))) uchar4 vload4(size_t, const __constant uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char4 +vldg14(size_t i, const __global char *p) +{ + char4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg14"))) char4 vload4(size_t, const __global char *); +extern __attribute__((overloadable, weak, alias("vldg14"))) uchar4 vload4(size_t, const __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char4 +vldl14(size_t i, const __local char *p) +{ + char4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl14"))) char4 vload4(size_t, const __local char *); +extern __attribute__((overloadable, weak, alias("vldl14"))) uchar4 vload4(size_t, const __local uchar *); +#endif + + +__attribute__((always_inline)) 
static short4 +vldp24(size_t i, const short *p) +{ + short4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp24"))) short4 vload4(size_t, const short *); +extern __attribute__((overloadable, weak, alias("vldp24"))) ushort4 vload4(size_t, const ushort *); + + + +__attribute__((always_inline)) static short4 +vldc24(size_t i, const __constant short *p) +{ + short4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc24"))) short4 vload4(size_t, const __constant short *); +extern __attribute__((overloadable, weak, alias("vldc24"))) ushort4 vload4(size_t, const __constant ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short4 +vldg24(size_t i, const __global short *p) +{ + short4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg24"))) short4 vload4(size_t, const __global short *); +extern __attribute__((overloadable, weak, alias("vldg24"))) ushort4 vload4(size_t, const __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short4 +vldl24(size_t i, const __local short *p) +{ + short4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl24"))) short4 vload4(size_t, const __local short *); +extern __attribute__((overloadable, weak, alias("vldl24"))) ushort4 vload4(size_t, const __local ushort *); +#endif + + +__attribute__((always_inline)) static int4 +vldp44(size_t i, const int *p) +{ + int4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp44"))) int4 
vload4(size_t, const int *); +extern __attribute__((overloadable, weak, alias("vldp44"))) uint4 vload4(size_t, const uint *); + + + +__attribute__((always_inline)) static int4 +vldc44(size_t i, const __constant int *p) +{ + int4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc44"))) int4 vload4(size_t, const __constant int *); +extern __attribute__((overloadable, weak, alias("vldc44"))) uint4 vload4(size_t, const __constant uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int4 +vldg44(size_t i, const __global int *p) +{ + int4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg44"))) int4 vload4(size_t, const __global int *); +extern __attribute__((overloadable, weak, alias("vldg44"))) uint4 vload4(size_t, const __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int4 +vldl44(size_t i, const __local int *p) +{ + int4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl44"))) int4 vload4(size_t, const __local int *); +extern __attribute__((overloadable, weak, alias("vldl44"))) uint4 vload4(size_t, const __local uint *); +#endif + + +__attribute__((always_inline)) static long4 +vldp84(size_t i, const long *p) +{ + long4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp84"))) long4 vload4(size_t, const long *); +extern __attribute__((overloadable, weak, alias("vldp84"))) ulong4 vload4(size_t, const ulong *); + + + +__attribute__((always_inline)) static long4 +vldc84(size_t i, const __constant long *p) +{ + long4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 
= p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc84"))) long4 vload4(size_t, const __constant long *); +extern __attribute__((overloadable, weak, alias("vldc84"))) ulong4 vload4(size_t, const __constant ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long4 +vldg84(size_t i, const __global long *p) +{ + long4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg84"))) long4 vload4(size_t, const __global long *); +extern __attribute__((overloadable, weak, alias("vldg84"))) ulong4 vload4(size_t, const __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long4 +vldl84(size_t i, const __local long *p) +{ + long4 ret; + p += i * 4; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl84"))) long4 vload4(size_t, const __local long *); +extern __attribute__((overloadable, weak, alias("vldl84"))) ulong4 vload4(size_t, const __local ulong *); +#endif + + +__attribute__((always_inline)) static char8 +vldp18(size_t i, const char *p) +{ + char8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp18"))) char8 vload8(size_t, const char *); +extern __attribute__((overloadable, weak, alias("vldp18"))) uchar8 vload8(size_t, const uchar *); + + + +__attribute__((always_inline)) static char8 +vldc18(size_t i, const __constant char *p) +{ + char8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, 
alias("vldc18"))) char8 vload8(size_t, const __constant char *); +extern __attribute__((overloadable, weak, alias("vldc18"))) uchar8 vload8(size_t, const __constant uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char8 +vldg18(size_t i, const __global char *p) +{ + char8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg18"))) char8 vload8(size_t, const __global char *); +extern __attribute__((overloadable, weak, alias("vldg18"))) uchar8 vload8(size_t, const __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char8 +vldl18(size_t i, const __local char *p) +{ + char8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl18"))) char8 vload8(size_t, const __local char *); +extern __attribute__((overloadable, weak, alias("vldl18"))) uchar8 vload8(size_t, const __local uchar *); +#endif + + +__attribute__((always_inline)) static short8 +vldp28(size_t i, const short *p) +{ + short8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp28"))) short8 vload8(size_t, const short *); +extern __attribute__((overloadable, weak, alias("vldp28"))) ushort8 vload8(size_t, const ushort *); + + + +__attribute__((always_inline)) static short8 +vldc28(size_t i, const __constant short *p) +{ + short8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern 
__attribute__((overloadable, weak, alias("vldc28"))) short8 vload8(size_t, const __constant short *); +extern __attribute__((overloadable, weak, alias("vldc28"))) ushort8 vload8(size_t, const __constant ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short8 +vldg28(size_t i, const __global short *p) +{ + short8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg28"))) short8 vload8(size_t, const __global short *); +extern __attribute__((overloadable, weak, alias("vldg28"))) ushort8 vload8(size_t, const __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short8 +vldl28(size_t i, const __local short *p) +{ + short8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl28"))) short8 vload8(size_t, const __local short *); +extern __attribute__((overloadable, weak, alias("vldl28"))) ushort8 vload8(size_t, const __local ushort *); +#endif + + +__attribute__((always_inline)) static int8 +vldp48(size_t i, const int *p) +{ + int8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp48"))) int8 vload8(size_t, const int *); +extern __attribute__((overloadable, weak, alias("vldp48"))) uint8 vload8(size_t, const uint *); + + + +__attribute__((always_inline)) static int8 +vldc48(size_t i, const __constant int *p) +{ + int8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + 
+ return ret; +} +extern __attribute__((overloadable, weak, alias("vldc48"))) int8 vload8(size_t, const __constant int *); +extern __attribute__((overloadable, weak, alias("vldc48"))) uint8 vload8(size_t, const __constant uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int8 +vldg48(size_t i, const __global int *p) +{ + int8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg48"))) int8 vload8(size_t, const __global int *); +extern __attribute__((overloadable, weak, alias("vldg48"))) uint8 vload8(size_t, const __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int8 +vldl48(size_t i, const __local int *p) +{ + int8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl48"))) int8 vload8(size_t, const __local int *); +extern __attribute__((overloadable, weak, alias("vldl48"))) uint8 vload8(size_t, const __local uint *); +#endif + + +__attribute__((always_inline)) static long8 +vldp88(size_t i, const long *p) +{ + long8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp88"))) long8 vload8(size_t, const long *); +extern __attribute__((overloadable, weak, alias("vldp88"))) ulong8 vload8(size_t, const ulong *); + + + +__attribute__((always_inline)) static long8 +vldc88(size_t i, const __constant long *p) +{ + long8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + 
return ret; +} +extern __attribute__((overloadable, weak, alias("vldc88"))) long8 vload8(size_t, const __constant long *); +extern __attribute__((overloadable, weak, alias("vldc88"))) ulong8 vload8(size_t, const __constant ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long8 +vldg88(size_t i, const __global long *p) +{ + long8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg88"))) long8 vload8(size_t, const __global long *); +extern __attribute__((overloadable, weak, alias("vldg88"))) ulong8 vload8(size_t, const __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long8 +vldl88(size_t i, const __local long *p) +{ + long8 ret; + p += i * 8; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl88"))) long8 vload8(size_t, const __local long *); +extern __attribute__((overloadable, weak, alias("vldl88"))) ulong8 vload8(size_t, const __local ulong *); +#endif + + +__attribute__((always_inline)) static char16 +vldp116(size_t i, const char *p) +{ + char16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp116"))) char16 vload16(size_t, const char *); +extern __attribute__((overloadable, weak, alias("vldp116"))) uchar16 vload16(size_t, const uchar *); + + + +__attribute__((always_inline)) static char16 +vldc116(size_t i, const __constant char *p) 
+{ + char16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc116"))) char16 vload16(size_t, const __constant char *); +extern __attribute__((overloadable, weak, alias("vldc116"))) uchar16 vload16(size_t, const __constant uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char16 +vldg116(size_t i, const __global char *p) +{ + char16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg116"))) char16 vload16(size_t, const __global char *); +extern __attribute__((overloadable, weak, alias("vldg116"))) uchar16 vload16(size_t, const __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static char16 +vldl116(size_t i, const __local char *p) +{ + char16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl116"))) char16 vload16(size_t, const __local char *); +extern __attribute__((overloadable, weak, alias("vldl116"))) uchar16 vload16(size_t, const __local uchar *); +#endif + + +__attribute__((always_inline)) static short16 +vldp216(size_t i, const short *p) 
+{ + short16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp216"))) short16 vload16(size_t, const short *); +extern __attribute__((overloadable, weak, alias("vldp216"))) ushort16 vload16(size_t, const ushort *); + + + +__attribute__((always_inline)) static short16 +vldc216(size_t i, const __constant short *p) +{ + short16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc216"))) short16 vload16(size_t, const __constant short *); +extern __attribute__((overloadable, weak, alias("vldc216"))) ushort16 vload16(size_t, const __constant ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short16 +vldg216(size_t i, const __global short *p) +{ + short16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg216"))) short16 vload16(size_t, const __global short *); +extern __attribute__((overloadable, weak, alias("vldg216"))) ushort16 vload16(size_t, const __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static short16 +vldl216(size_t i, const __local 
short *p) +{ + short16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl216"))) short16 vload16(size_t, const __local short *); +extern __attribute__((overloadable, weak, alias("vldl216"))) ushort16 vload16(size_t, const __local ushort *); +#endif + + +__attribute__((always_inline)) static int16 +vldp416(size_t i, const int *p) +{ + int16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp416"))) int16 vload16(size_t, const int *); +extern __attribute__((overloadable, weak, alias("vldp416"))) uint16 vload16(size_t, const uint *); + + + +__attribute__((always_inline)) static int16 +vldc416(size_t i, const __constant int *p) +{ + int16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc416"))) int16 vload16(size_t, const __constant int *); +extern __attribute__((overloadable, weak, alias("vldc416"))) uint16 vload16(size_t, const __constant uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int16 +vldg416(size_t i, const __global int *p) +{ + int16 ret; + p += i * 16; + ret.s0 = p[0]; + 
ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg416"))) int16 vload16(size_t, const __global int *); +extern __attribute__((overloadable, weak, alias("vldg416"))) uint16 vload16(size_t, const __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static int16 +vldl416(size_t i, const __local int *p) +{ + int16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl416"))) int16 vload16(size_t, const __local int *); +extern __attribute__((overloadable, weak, alias("vldl416"))) uint16 vload16(size_t, const __local uint *); +#endif + + +__attribute__((always_inline)) static long16 +vldp816(size_t i, const long *p) +{ + long16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldp816"))) long16 vload16(size_t, const long *); +extern __attribute__((overloadable, weak, alias("vldp816"))) ulong16 vload16(size_t, const ulong *); + + + +__attribute__((always_inline)) static long16 +vldc816(size_t i, const __constant long *p) +{ + long16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = 
p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldc816"))) long16 vload16(size_t, const __constant long *); +extern __attribute__((overloadable, weak, alias("vldc816"))) ulong16 vload16(size_t, const __constant ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long16 +vldg816(size_t i, const __global long *p) +{ + long16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldg816"))) long16 vload16(size_t, const __global long *); +extern __attribute__((overloadable, weak, alias("vldg816"))) ulong16 vload16(size_t, const __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static long16 +vldl816(size_t i, const __local long *p) +{ + long16 ret; + p += i * 16; + ret.s0 = p[0]; + ret.s1 = p[1]; + ret.s2 = p[2]; + ret.s3 = p[3]; + ret.s4 = p[4]; + ret.s5 = p[5]; + ret.s6 = p[6]; + ret.s7 = p[7]; + ret.s8 = p[8]; + ret.s9 = p[9]; + ret.sa = p[10]; + ret.sb = p[11]; + ret.sc = p[12]; + ret.sd = p[13]; + ret.se = p[14]; + ret.sf = p[15]; + + return ret; +} +extern __attribute__((overloadable, weak, alias("vldl816"))) long16 vload16(size_t, const __local long *); +extern __attribute__((overloadable, weak, alias("vldl816"))) ulong16 vload16(size_t, const __local ulong *); +#endif + + +__attribute__((always_inline)) static void +vstp12(char2 v, size_t i, char *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstp12"))) 
void vstore2( char2, size_t, char *); +extern __attribute__((overloadable, weak, alias("vstp12"))) void vstore2(uchar2, size_t, uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg12(char2 v, size_t i, __global char *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2( char2, size_t, __global char *); +extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2(uchar2, size_t, __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl12(char2 v, size_t i, __local char *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2( char2, size_t, __local char *); +extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2(uchar2, size_t, __local uchar *); +#endif + + +__attribute__((always_inline)) static void +vstp22(short2 v, size_t i, short *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2( short2, size_t, short *); +extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2(ushort2, size_t, ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg22(short2 v, size_t i, __global short *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2( short2, size_t, __global short *); +extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2(ushort2, size_t, __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl22(short2 v, size_t i, __local short *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstl22"))) void vstore2( short2, size_t, __local short *); +extern 
__attribute__((overloadable, weak, alias("vstl22"))) void vstore2(ushort2, size_t, __local ushort *); +#endif + + +__attribute__((always_inline)) static void +vstp42(int2 v, size_t i, int *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2( int2, size_t, int *); +extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2(uint2, size_t, uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg42(int2 v, size_t i, __global int *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2( int2, size_t, __global int *); +extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2(uint2, size_t, __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl42(int2 v, size_t i, __local int *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2( int2, size_t, __local int *); +extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2(uint2, size_t, __local uint *); +#endif + + +__attribute__((always_inline)) static void +vstp82(long2 v, size_t i, long *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2( long2, size_t, long *); +extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2(ulong2, size_t, ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg82(long2 v, size_t i, __global long *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2( long2, size_t, __global long *); +extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2(ulong2, size_t, __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 
+__attribute__((always_inline)) static void +vstl82(long2 v, size_t i, __local long *p) +{ + p += i * 2; + p[0] = v.s0; + p[1] = v.s1; + +} +extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2( long2, size_t, __local long *); +extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2(ulong2, size_t, __local ulong *); +#endif + + +__attribute__((always_inline)) static void +vstp13(char3 v, size_t i, char *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3( char3, size_t, char *); +extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3(uchar3, size_t, uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg13(char3 v, size_t i, __global char *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3( char3, size_t, __global char *); +extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3(uchar3, size_t, __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl13(char3 v, size_t i, __local char *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3( char3, size_t, __local char *); +extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3(uchar3, size_t, __local uchar *); +#endif + + +__attribute__((always_inline)) static void +vstp23(short3 v, size_t i, short *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3( short3, size_t, short *); +extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3(ushort3, size_t, ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg23(short3 v, 
size_t i, __global short *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3( short3, size_t, __global short *); +extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3(ushort3, size_t, __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl23(short3 v, size_t i, __local short *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3( short3, size_t, __local short *); +extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3(ushort3, size_t, __local ushort *); +#endif + + +__attribute__((always_inline)) static void +vstp43(int3 v, size_t i, int *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3( int3, size_t, int *); +extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3(uint3, size_t, uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg43(int3 v, size_t i, __global int *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3( int3, size_t, __global int *); +extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3(uint3, size_t, __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl43(int3 v, size_t i, __local int *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3( int3, size_t, __local int *); +extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3(uint3, size_t, __local uint *); +#endif + + +__attribute__((always_inline)) static void +vstp83(long3 v, size_t i, long *p) +{ + p += i 
* 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3( long3, size_t, long *); +extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3(ulong3, size_t, ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg83(long3 v, size_t i, __global long *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3( long3, size_t, __global long *); +extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3(ulong3, size_t, __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl83(long3 v, size_t i, __local long *p) +{ + p += i * 3; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + +} +extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3( long3, size_t, __local long *); +extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3(ulong3, size_t, __local ulong *); +#endif + + +__attribute__((always_inline)) static void +vstp14(char4 v, size_t i, char *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4( char4, size_t, char *); +extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4(uchar4, size_t, uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg14(char4 v, size_t i, __global char *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4( char4, size_t, __global char *); +extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4(uchar4, size_t, __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl14(char4 v, size_t i, __local char 
*p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4( char4, size_t, __local char *); +extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4(uchar4, size_t, __local uchar *); +#endif + + +__attribute__((always_inline)) static void +vstp24(short4 v, size_t i, short *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4( short4, size_t, short *); +extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4(ushort4, size_t, ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg24(short4 v, size_t i, __global short *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4( short4, size_t, __global short *); +extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4(ushort4, size_t, __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl24(short4 v, size_t i, __local short *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4( short4, size_t, __local short *); +extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4(ushort4, size_t, __local ushort *); +#endif + + +__attribute__((always_inline)) static void +vstp44(int4 v, size_t i, int *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4( int4, size_t, int *); +extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4(uint4, size_t, uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void 
+vstg44(int4 v, size_t i, __global int *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4( int4, size_t, __global int *); +extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4(uint4, size_t, __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl44(int4 v, size_t i, __local int *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4( int4, size_t, __local int *); +extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4(uint4, size_t, __local uint *); +#endif + + +__attribute__((always_inline)) static void +vstp84(long4 v, size_t i, long *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4( long4, size_t, long *); +extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4(ulong4, size_t, ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg84(long4 v, size_t i, __global long *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4( long4, size_t, __global long *); +extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4(ulong4, size_t, __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl84(long4 v, size_t i, __local long *p) +{ + p += i * 4; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + +} +extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4( long4, size_t, __local long *); +extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4(ulong4, size_t, __local ulong *); +#endif + + 
+__attribute__((always_inline)) static void +vstp18(char8 v, size_t i, char *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8( char8, size_t, char *); +extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8(uchar8, size_t, uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg18(char8 v, size_t i, __global char *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8( char8, size_t, __global char *); +extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8(uchar8, size_t, __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl18(char8 v, size_t i, __local char *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8( char8, size_t, __local char *); +extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8(uchar8, size_t, __local uchar *); +#endif + + +__attribute__((always_inline)) static void +vstp28(short8 v, size_t i, short *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8( short8, size_t, short *); +extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8(ushort8, size_t, ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg28(short8 v, size_t i, __global short *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = 
v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8( short8, size_t, __global short *); +extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8(ushort8, size_t, __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl28(short8 v, size_t i, __local short *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8( short8, size_t, __local short *); +extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8(ushort8, size_t, __local ushort *); +#endif + + +__attribute__((always_inline)) static void +vstp48(int8 v, size_t i, int *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8( int8, size_t, int *); +extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8(uint8, size_t, uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg48(int8 v, size_t i, __global int *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8( int8, size_t, __global int *); +extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8(uint8, size_t, __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl48(int8 v, size_t i, __local int *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern 
__attribute__((overloadable, weak, alias("vstl48"))) void vstore8( int8, size_t, __local int *); +extern __attribute__((overloadable, weak, alias("vstl48"))) void vstore8(uint8, size_t, __local uint *); +#endif + + +__attribute__((always_inline)) static void +vstp88(long8 v, size_t i, long *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8( long8, size_t, long *); +extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8(ulong8, size_t, ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg88(long8 v, size_t i, __global long *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8( long8, size_t, __global long *); +extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8(ulong8, size_t, __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl88(long8 v, size_t i, __local long *p) +{ + p += i * 8; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + +} +extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8( long8, size_t, __local long *); +extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8(ulong8, size_t, __local ulong *); +#endif + + +__attribute__((always_inline)) static void +vstp116(char16 v, size_t i, char *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern 
__attribute__((overloadable, weak, alias("vstp116"))) void vstore16( char16, size_t, char *); +extern __attribute__((overloadable, weak, alias("vstp116"))) void vstore16(uchar16, size_t, uchar *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg116(char16 v, size_t i, __global char *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16( char16, size_t, __global char *); +extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16(uchar16, size_t, __global uchar *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl116(char16 v, size_t i, __local char *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16( char16, size_t, __local char *); +extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16(uchar16, size_t, __local uchar *); +#endif + + +__attribute__((always_inline)) static void +vstp216(short16 v, size_t i, short *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16( short16, size_t, short *); +extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16(ushort16, 
size_t, ushort *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg216(short16 v, size_t i, __global short *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16( short16, size_t, __global short *); +extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16(ushort16, size_t, __global ushort *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl216(short16 v, size_t i, __local short *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16( short16, size_t, __local short *); +extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16(ushort16, size_t, __local ushort *); +#endif + + +__attribute__((always_inline)) static void +vstp416(int16 v, size_t i, int *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16( int16, size_t, int *); +extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16(uint16, size_t, uint *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg416(int16 v, size_t i, __global int *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] 
= v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16( int16, size_t, __global int *); +extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16(uint16, size_t, __global uint *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl416(int16 v, size_t i, __local int *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16( int16, size_t, __local int *); +extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16(uint16, size_t, __local uint *); +#endif + + +__attribute__((always_inline)) static void +vstp816(long16 v, size_t i, long *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16( long16, size_t, long *); +extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16(ulong16, size_t, ulong *); + + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstg816(long16 v, size_t i, __global long *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] 
= v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16( long16, size_t, __global long *); +extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16(ulong16, size_t, __global ulong *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +__attribute__((always_inline)) static void +vstl816(long16 v, size_t i, __local long *p) +{ + p += i * 16; + p[0] = v.s0; + p[1] = v.s1; + p[2] = v.s2; + p[3] = v.s3; + p[4] = v.s4; + p[5] = v.s5; + p[6] = v.s6; + p[7] = v.s7; + p[8] = v.s8; + p[9] = v.s9; + p[10] = v.sa; + p[11] = v.sb; + p[12] = v.sc; + p[13] = v.sd; + p[14] = v.se; + p[15] = v.sf; + +} +extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16( long16, size_t, __local long *); +extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16(ulong16, size_t, __local ulong *); +#endif +
diff --git a/amd-builtins/vldst/vldst_half.cl b/amd-builtins/vldst/vldst_half.cl new file mode 100644 index 0000000..069cded --- /dev/null +++ b/amd-builtins/vldst/vldst_half.cl
@@ -0,0 +1,4237 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + + +extern __attribute__((pure)) float __cvt_f16_to_f32(ushort); + +__attribute__((always_inline)) static float +vldhp(size_t i, const half *p) +{ + ushort h = *(const short *)(p + i); + return __cvt_f16_to_f32(h); +} +extern __attribute__((overloadable, weak, alias("vldhp"))) float vload_half(size_t, const half *); +extern __attribute__((overloadable, weak, alias("vldhp"))) float vloada_half(size_t, const half *); + + + +extern __attribute__((pure)) float __cvt_f16_to_f32(ushort); + +__attribute__((always_inline)) static float +vldhc(size_t i, const __constant half *p) +{ + ushort h = *(const __constant short *)(p + i); + return __cvt_f16_to_f32(h); +} +extern __attribute__((overloadable, weak, alias("vldhc"))) float vload_half(size_t, const __constant half *); +extern __attribute__((overloadable, weak, alias("vldhc"))) float vloada_half(size_t, const __constant half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float __cvt_f16_to_f32(ushort); + +__attribute__((always_inline)) static float +vldhg(size_t i, const __global half *p) +{ + ushort h = *(const __global short *)(p + i); + return __cvt_f16_to_f32(h); +} +extern __attribute__((overloadable, weak, alias("vldhg"))) float vload_half(size_t, const __global half *); +extern __attribute__((overloadable, weak, alias("vldhg"))) float vloada_half(size_t, const __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float __cvt_f16_to_f32(ushort); + +__attribute__((always_inline)) static float +vldhl(size_t i, const __local half *p) +{ + ushort h = *(const __local short *)(p + i); + return __cvt_f16_to_f32(h); +} +extern __attribute__((overloadable, weak, alias("vldhl"))) float vload_half(size_t, const __local half *); +extern __attribute__((overloadable, weak, alias("vldhl"))) float vloada_half(size_t, const __local half *); +#endif + + +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, 
weak)) float2 +vload_half2(size_t i, const half *p) +{ + return __cvt_2f16_to_2f32(vload2(i, (const ushort *)p)); +} + + + +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, weak)) float2 +vload_half2(size_t i, const __constant half *p) +{ + return __cvt_2f16_to_2f32(vload2(i, (const __constant ushort *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, weak)) float2 +vload_half2(size_t i, const __global half *p) +{ + return __cvt_2f16_to_2f32(vload2(i, (const __global ushort *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, weak)) float2 +vload_half2(size_t i, const __local half *p) +{ + return __cvt_2f16_to_2f32(vload2(i, (const __local ushort *)p)); +} +#endif + + +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vload_half3(size_t i, const half *p) +{ + return __cvt_3f16_to_3f32(vload3(i, (const ushort *)p)); +} + + + +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vload_half3(size_t i, const __constant half *p) +{ + return __cvt_3f16_to_3f32(vload3(i, (const __constant ushort *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vload_half3(size_t i, const __global half *p) +{ + return __cvt_3f16_to_3f32(vload3(i, (const __global ushort *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vload_half3(size_t i, const __local half *p) +{ + return __cvt_3f16_to_3f32(vload3(i, (const 
__local ushort *)p)); +} +#endif + + +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vload_half4(size_t i, const half *p) +{ + return __cvt_4f16_to_4f32(vload4(i, (const ushort *)p)); +} + + + +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vload_half4(size_t i, const __constant half *p) +{ + return __cvt_4f16_to_4f32(vload4(i, (const __constant ushort *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vload_half4(size_t i, const __global half *p) +{ + return __cvt_4f16_to_4f32(vload4(i, (const __global ushort *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vload_half4(size_t i, const __local half *p) +{ + return __cvt_4f16_to_4f32(vload4(i, (const __local ushort *)p)); +} +#endif + + +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + +__attribute__((overloadable, always_inline, weak)) float8 +vload_half8(size_t i, const half *p) +{ + return __cvt_8f16_to_8f32(vload8(i, (const ushort *)p)); +} + + + +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + +__attribute__((overloadable, always_inline, weak)) float8 +vload_half8(size_t i, const __constant half *p) +{ + return __cvt_8f16_to_8f32(vload8(i, (const __constant ushort *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + +__attribute__((overloadable, always_inline, weak)) float8 +vload_half8(size_t i, const __global half *p) +{ + return __cvt_8f16_to_8f32(vload8(i, (const __global ushort *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + 
+__attribute__((overloadable, always_inline, weak)) float8 +vload_half8(size_t i, const __local half *p) +{ + return __cvt_8f16_to_8f32(vload8(i, (const __local ushort *)p)); +} +#endif + + +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vload_half16(size_t i, const half *p) +{ + return __cvt_16f16_to_16f32(vload16(i, (const ushort *)p)); +} + + + +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vload_half16(size_t i, const __constant half *p) +{ + return __cvt_16f16_to_16f32(vload16(i, (const __constant ushort *)p)); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vload_half16(size_t i, const __global half *p) +{ + return __cvt_16f16_to_16f32(vload16(i, (const __global ushort *)p)); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vload_half16(size_t i, const __local half *p) +{ + return __cvt_16f16_to_16f32(vload16(i, (const __local ushort *)p)); +} +#endif + + +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, weak)) float2 +vloada_half2(size_t i, const half *p) +{ + + return __cvt_2f16_to_2f32(*(const ushort2 *)(p + i * 2)); + +} + + + +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, weak)) float2 +vloada_half2(size_t i, const __constant half *p) +{ + + return __cvt_2f16_to_2f32(*(const __constant ushort2 *)(p + i * 2)); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, weak)) float2 +vloada_half2(size_t i, const __global 
half *p) +{ + + return __cvt_2f16_to_2f32(*(const __global ushort2 *)(p + i * 2)); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2); + +__attribute__((overloadable, always_inline, weak)) float2 +vloada_half2(size_t i, const __local half *p) +{ + + return __cvt_2f16_to_2f32(*(const __local ushort2 *)(p + i * 2)); + +} +#endif + + +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vloada_half3(size_t i, const half *p) +{ + + ushort4 h = *(const ushort4 *)(p + i * 4); + return __cvt_3f16_to_3f32(h.s012); + +} + + + +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vloada_half3(size_t i, const __constant half *p) +{ + + ushort4 h = *(const __constant ushort4 *)(p + i * 4); + return __cvt_3f16_to_3f32(h.s012); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vloada_half3(size_t i, const __global half *p) +{ + + ushort4 h = *(const __global ushort4 *)(p + i * 4); + return __cvt_3f16_to_3f32(h.s012); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3); + +__attribute__((overloadable, always_inline, weak)) float3 +vloada_half3(size_t i, const __local half *p) +{ + + ushort4 h = *(const __local ushort4 *)(p + i * 4); + return __cvt_3f16_to_3f32(h.s012); + +} +#endif + + +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vloada_half4(size_t i, const half *p) +{ + + return __cvt_4f16_to_4f32(*(const ushort4 *)(p + i * 4)); + +} + + + +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vloada_half4(size_t i, const __constant half *p) +{ 
+ + return __cvt_4f16_to_4f32(*(const __constant ushort4 *)(p + i * 4)); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vloada_half4(size_t i, const __global half *p) +{ + + return __cvt_4f16_to_4f32(*(const __global ushort4 *)(p + i * 4)); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4); + +__attribute__((overloadable, always_inline, weak)) float4 +vloada_half4(size_t i, const __local half *p) +{ + + return __cvt_4f16_to_4f32(*(const __local ushort4 *)(p + i * 4)); + +} +#endif + + +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + +__attribute__((overloadable, always_inline, weak)) float8 +vloada_half8(size_t i, const half *p) +{ + + return __cvt_8f16_to_8f32(*(const ushort8 *)(p + i * 8)); + +} + + + +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + +__attribute__((overloadable, always_inline, weak)) float8 +vloada_half8(size_t i, const __constant half *p) +{ + + return __cvt_8f16_to_8f32(*(const __constant ushort8 *)(p + i * 8)); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + +__attribute__((overloadable, always_inline, weak)) float8 +vloada_half8(size_t i, const __global half *p) +{ + + return __cvt_8f16_to_8f32(*(const __global ushort8 *)(p + i * 8)); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8); + +__attribute__((overloadable, always_inline, weak)) float8 +vloada_half8(size_t i, const __local half *p) +{ + + return __cvt_8f16_to_8f32(*(const __local ushort8 *)(p + i * 8)); + +} +#endif + + +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vloada_half16(size_t i, const half *p) +{ + + return __cvt_16f16_to_16f32(*(const ushort16 *)(p 
+ i * 16)); + +} + + + +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vloada_half16(size_t i, const __constant half *p) +{ + + return __cvt_16f16_to_16f32(*(const __constant ushort16 *)(p + i * 16)); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vloada_half16(size_t i, const __global half *p) +{ + + return __cvt_16f16_to_16f32(*(const __global ushort16 *)(p + i * 16)); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16); + +__attribute__((overloadable, always_inline, weak)) float16 +vloada_half16(size_t i, const __local half *p) +{ + + return __cvt_16f16_to_16f32(*(const __local ushort16 *)(p + i * 16)); + +} +#endif + + +extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float); + +__attribute__((always_inline)) static void +vsthpf32c(float v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f32_to_f16_cur(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf32c"))) void vstore_half(float, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf32c"))) void vstorea_half(float, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float); + +__attribute__((always_inline)) static void +vsthgf32c(float v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f32_to_f16_cur(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf32c"))) void vstore_half(float, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf32c"))) void vstorea_half(float, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float); + +__attribute__((always_inline)) static void +vsthlf32c(float v, size_t i, 
__local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f32_to_f16_cur(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf32c"))) void vstore_half(float, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf32c"))) void vstorea_half(float, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float); + +__attribute__((always_inline)) static void +vsthpf32e(float v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f32_to_f16_rte(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf32e"))) void vstore_half_rte(float, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf32e"))) void vstorea_half_rte(float, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float); + +__attribute__((always_inline)) static void +vsthgf32e(float v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f32_to_f16_rte(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf32e"))) void vstore_half_rte(float, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf32e"))) void vstorea_half_rte(float, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float); + +__attribute__((always_inline)) static void +vsthlf32e(float v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f32_to_f16_rte(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf32e"))) void vstore_half_rte(float, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf32e"))) void vstorea_half_rte(float, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float); + +__attribute__((always_inline)) static void +vsthpf32p(float v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f32_to_f16_rtp(v); +} +extern __attribute__((overloadable, weak, 
alias("vsthpf32p"))) void vstore_half_rtp(float, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf32p"))) void vstorea_half_rtp(float, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float); + +__attribute__((always_inline)) static void +vsthgf32p(float v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtp(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf32p"))) void vstore_half_rtp(float, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf32p"))) void vstorea_half_rtp(float, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float); + +__attribute__((always_inline)) static void +vsthlf32p(float v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtp(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf32p"))) void vstore_half_rtp(float, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf32p"))) void vstorea_half_rtp(float, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float); + +__attribute__((always_inline)) static void +vsthpf32n(float v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f32_to_f16_rtn(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf32n"))) void vstore_half_rtn(float, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf32n"))) void vstorea_half_rtn(float, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float); + +__attribute__((always_inline)) static void +vsthgf32n(float v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtn(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf32n"))) void vstore_half_rtn(float, size_t, __global half *); +extern 
__attribute__((overloadable, weak, alias("vsthgf32n"))) void vstorea_half_rtn(float, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float); + +__attribute__((always_inline)) static void +vsthlf32n(float v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtn(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf32n"))) void vstore_half_rtn(float, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf32n"))) void vstorea_half_rtn(float, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float); + +__attribute__((always_inline)) static void +vsthpf32z(float v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f32_to_f16_rtz(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf32z"))) void vstore_half_rtz(float, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf32z"))) void vstorea_half_rtz(float, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float); + +__attribute__((always_inline)) static void +vsthgf32z(float v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtz(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf32z"))) void vstore_half_rtz(float, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf32z"))) void vstorea_half_rtz(float, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float); + +__attribute__((always_inline)) static void +vsthlf32z(float v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtz(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf32z"))) void vstore_half_rtz(float, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf32z"))) void 
vstorea_half_rtz(float, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double); + +__attribute__((always_inline)) static void +vsthpf64c(double v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f64_to_f16_cur(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf64c"))) void vstore_half(double, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf64c"))) void vstorea_half(double, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double); + +__attribute__((always_inline)) static void +vsthgf64c(double v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f64_to_f16_cur(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf64c"))) void vstore_half(double, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf64c"))) void vstorea_half(double, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double); + +__attribute__((always_inline)) static void +vsthlf64c(double v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f64_to_f16_cur(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf64c"))) void vstore_half(double, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf64c"))) void vstorea_half(double, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double); + +__attribute__((always_inline)) static void +vsthpf64e(double v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f64_to_f16_rte(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf64e"))) void vstore_half_rte(double, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf64e"))) void vstorea_half_rte(double, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort 
__cvt_f64_to_f16_rte(double); + +__attribute__((always_inline)) static void +vsthgf64e(double v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f64_to_f16_rte(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf64e"))) void vstore_half_rte(double, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf64e"))) void vstorea_half_rte(double, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double); + +__attribute__((always_inline)) static void +vsthlf64e(double v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f64_to_f16_rte(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf64e"))) void vstore_half_rte(double, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf64e"))) void vstorea_half_rte(double, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double); + +__attribute__((always_inline)) static void +vsthpf64p(double v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f64_to_f16_rtp(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf64p"))) void vstore_half_rtp(double, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf64p"))) void vstorea_half_rtp(double, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double); + +__attribute__((always_inline)) static void +vsthgf64p(double v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtp(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf64p"))) void vstore_half_rtp(double, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf64p"))) void vstorea_half_rtp(double, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double); + 
+__attribute__((always_inline)) static void +vsthlf64p(double v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtp(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf64p"))) void vstore_half_rtp(double, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf64p"))) void vstorea_half_rtp(double, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double); + +__attribute__((always_inline)) static void +vsthpf64n(double v, size_t i, half *p) +{ + *(ushort *)(p + i) = __cvt_f64_to_f16_rtn(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf64n"))) void vstore_half_rtn(double, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf64n"))) void vstorea_half_rtn(double, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double); + +__attribute__((always_inline)) static void +vsthgf64n(double v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtn(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf64n"))) void vstore_half_rtn(double, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf64n"))) void vstorea_half_rtn(double, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double); + +__attribute__((always_inline)) static void +vsthlf64n(double v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtn(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf64n"))) void vstore_half_rtn(double, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf64n"))) void vstorea_half_rtn(double, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double); + +__attribute__((always_inline)) static void +vsthpf64z(double v, size_t i, half *p) 
+{ + *(ushort *)(p + i) = __cvt_f64_to_f16_rtz(v); +} +extern __attribute__((overloadable, weak, alias("vsthpf64z"))) void vstore_half_rtz(double, size_t, half *); +extern __attribute__((overloadable, weak, alias("vsthpf64z"))) void vstorea_half_rtz(double, size_t, half *); + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double); + +__attribute__((always_inline)) static void +vsthgf64z(double v, size_t i, __global half *p) +{ + *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtz(v); +} +extern __attribute__((overloadable, weak, alias("vsthgf64z"))) void vstore_half_rtz(double, size_t, __global half *); +extern __attribute__((overloadable, weak, alias("vsthgf64z"))) void vstorea_half_rtz(double, size_t, __global half *); +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double); + +__attribute__((always_inline)) static void +vsthlf64z(double v, size_t i, __local half *p) +{ + *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtz(v); +} +extern __attribute__((overloadable, weak, alias("vsthlf64z"))) void vstore_half_rtz(double, size_t, __local half *); +extern __attribute__((overloadable, weak, alias("vsthlf64z"))) void vstorea_half_rtz(double, size_t, __local half *); +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2(float2 v, size_t i, half *p) +{ + vstore2(__cvt_2f32_to_2f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2(float2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f32_to_2f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2(float2 
v, size_t i, __local half *p) +{ + vstore2(__cvt_2f32_to_2f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rte(float2 v, size_t i, half *p) +{ + vstore2(__cvt_2f32_to_2f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rte(float2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f32_to_2f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rte(float2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f32_to_2f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtp(float2 v, size_t i, half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtp(float2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtp(float2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtn(float2 v, size_t i, half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtn(v), i, 
/*
 * NOTE(review): mangled diff hunk -- each physical line below fuses many
 * "+"-prefixed patch lines of conversions.cl.  Region: vstore_half2_rtn/_rtz
 * (float2 source) and vstore_half3 cur/rte/rtp (float3 source).
 * Pattern for every overload: convert the floatN vector to IEEE-754 half
 * bits (ushortN) via an AMD __cvt_Nf32_to_Nf16_<mode> builtin, then write
 * them with vstoreN through the half pointer reinterpreted as ushort*.
 * The "#if __OPENCL_C_VERSION__ < 200" copies add explicit __global/__local
 * overloads -- presumably redundant under OpenCL 2.0 generic address
 * spaces; TODO confirm against the OpenCL 2.0 spec.  The "_cur" suffix
 * presumably means "current rounding mode" -- verify against the builtin
 * library.  The extern prototype is re-declared before every overload;
 * harmless duplication, likely emitted by a generator.
 */
(ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtn(float2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtn(float2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtz(float2 v, size_t i, half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtz(float2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtz(float2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3(float3 v, size_t i, half *p) +{ + vstore3(__cvt_3f32_to_3f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3(float3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f32_to_3f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ 
< 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3(float3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f32_to_3f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rte(float3 v, size_t i, half *p) +{ + vstore3(__cvt_3f32_to_3f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rte(float3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f32_to_3f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rte(float3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f32_to_3f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtp(float3 v, size_t i, half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtp(float3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtp(float3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3); 
/*
 * vstore_half3_rtn/_rtz (float3), all vstore_half4* (float4) and all
 * vstore_half8* (float8) overloads, ending with the extern prototype for
 * __cvt_16f32_to_16f16_cur.  Same generated pattern throughout: convert via
 * __cvt_Nf32_to_Nf16_<rounding-suffix> and store the raw half bits with
 * vstoreN through a ushort-cast pointer; each function comes in generic,
 * __global and __local flavors, the latter two guarded by
 * "#if __OPENCL_C_VERSION__ < 200".
 */
+ +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtn(float3 v, size_t i, half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtn(float3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtn(float3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtz(float3 v, size_t i, half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtz(float3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtz(float3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4(float4 v, size_t i, half *p) +{ + vstore4(__cvt_4f32_to_4f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4); + +__attribute__((overloadable, always_inline, weak)) void 
+vstore_half4(float4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f32_to_4f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4(float4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f32_to_4f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rte(float4 v, size_t i, half *p) +{ + vstore4(__cvt_4f32_to_4f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rte(float4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f32_to_4f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rte(float4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f32_to_4f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtp(float4 v, size_t i, half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtp(float4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtp(float4 v, size_t i, __local 
half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtn(float4 v, size_t i, half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtn(float4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtn(float4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtz(float4 v, size_t i, half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtz(float4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtz(float4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8(float8 v, size_t i, half *p) +{ + vstore8(__cvt_8f32_to_8f16_cur(v), i, (ushort *)p); +} + + +#if 
__OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8(float8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f32_to_8f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8(float8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f32_to_8f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rte(float8 v, size_t i, half *p) +{ + vstore8(__cvt_8f32_to_8f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rte(float8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f32_to_8f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rte(float8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f32_to_8f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtp(float8 v, size_t i, half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtp(float8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern 
__attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtp(float8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtn(float8 v, size_t i, half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtn(float8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtn(float8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtz(float8 v, size_t i, half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtz(float8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtz(float8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16); + 
+__attribute__((overloadable, always_inline, weak)) void +vstore_half16(float16 v, size_t i, half *p) +{ + vstore16(__cvt_16f32_to_16f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16(float16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f32_to_16f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16(float16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f32_to_16f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rte(float16 v, size_t i, half *p) +{ + vstore16(__cvt_16f32_to_16f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rte(float16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f32_to_16f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rte(float16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f32_to_16f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtp(float16 v, size_t i, half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16); + 
/*
 * Remaining float16 variants (rtp address-space overloads, rtn, rtz),
 * then the first double-source overloads: vstore_half2 cur/rte/rtp on
 * double2.  The __cvt_2f64_to_2f16_* builtins convert double2 directly to
 * half bits with the rounding mode named by their suffix.
 */
+__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtp(float16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtp(float16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtn(float16 v, size_t i, half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtn(float16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtn(float16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtz(float16 v, size_t i, half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtz(float16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 
__cvt_16f32_to_16f16_rtz(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtz(float16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2(double2 v, size_t i, half *p) +{ + vstore2(__cvt_2f64_to_2f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2(double2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f64_to_2f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2(double2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f64_to_2f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rte(double2 v, size_t i, half *p) +{ + vstore2(__cvt_2f64_to_2f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rte(double2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f64_to_2f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rte(double2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f64_to_2f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2); + 
/*
 * Double-source overloads: vstore_half2 rtp/rtn/rtz (double2), all
 * vstore_half3* (double3) and vstore_half4 cur/rte/rtp/rtn (double4).
 * Same generated pattern as the float versions, substituting
 * __cvt_Nf64_to_Nf16_<mode> builtins.
 */
+__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtp(double2 v, size_t i, half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtp(double2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtp(double2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtn(double2 v, size_t i, half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtn(double2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtn(double2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtz(double2 v, size_t i, half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2); + +__attribute__((overloadable, always_inline, 
weak)) void +vstore_half2_rtz(double2 v, size_t i, __global half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half2_rtz(double2 v, size_t i, __local half *p) +{ + vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3(double3 v, size_t i, half *p) +{ + vstore3(__cvt_3f64_to_3f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3(double3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f64_to_3f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3(double3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f64_to_3f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rte(double3 v, size_t i, half *p) +{ + vstore3(__cvt_3f64_to_3f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rte(double3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f64_to_3f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rte(double3 
v, size_t i, __local half *p) +{ + vstore3(__cvt_3f64_to_3f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtp(double3 v, size_t i, half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtp(double3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtp(double3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtn(double3 v, size_t i, half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtn(double3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtn(double3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtz(double3 v, size_t i, half *p) +{ + 
vstore3(__cvt_3f64_to_3f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtz(double3 v, size_t i, __global half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half3_rtz(double3 v, size_t i, __local half *p) +{ + vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4(double4 v, size_t i, half *p) +{ + vstore4(__cvt_4f64_to_4f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4(double4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f64_to_4f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4(double4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f64_to_4f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rte(double4 v, size_t i, half *p) +{ + vstore4(__cvt_4f64_to_4f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rte(double4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f64_to_4f16_rte(v), i, (__global 
ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rte(double4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f64_to_4f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtp(double4 v, size_t i, half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtp(double4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtp(double4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtn(double4 v, size_t i, half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtn(double4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtn(double4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__local ushort *)p); +} +#endif + + 
/*
 * vstore_half4_rtz (double4), all vstore_half8* (double8) and the start of
 * the vstore_half16 double16 overloads (cur; rte prototype).  Same generated
 * convert-then-vstoreN pattern via __cvt_8f64_to_8f16_* /
 * __cvt_16f64_to_16f16_* builtins.
 */
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtz(double4 v, size_t i, half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtz(double4 v, size_t i, __global half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half4_rtz(double4 v, size_t i, __local half *p) +{ + vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8(double8 v, size_t i, half *p) +{ + vstore8(__cvt_8f64_to_8f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8(double8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f64_to_8f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8(double8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f64_to_8f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rte(double8 v, size_t i, half *p) +{ + vstore8(__cvt_8f64_to_8f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 
__cvt_8f64_to_8f16_rte(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rte(double8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f64_to_8f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rte(double8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f64_to_8f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtp(double8 v, size_t i, half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtp(double8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtp(double8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtn(double8 v, size_t i, half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtn(double8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 
__cvt_8f64_to_8f16_rtn(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtn(double8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtz(double8 v, size_t i, half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtz(double8 v, size_t i, __global half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half8_rtz(double8 v, size_t i, __local half *p) +{ + vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16(double16 v, size_t i, half *p) +{ + vstore16(__cvt_16f64_to_16f16_cur(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16(double16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f64_to_16f16_cur(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16(double16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f64_to_16f16_cur(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16); + 
+__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rte(double16 v, size_t i, half *p) +{ + vstore16(__cvt_16f64_to_16f16_rte(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rte(double16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f64_to_16f16_rte(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rte(double16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f64_to_16f16_rte(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtp(double16 v, size_t i, half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtp(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtp(double16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtp(double16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtn(double16 v, size_t i, half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtn(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 
__cvt_16f64_to_16f16_rtn(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtn(double16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtn(double16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtz(double16 v, size_t i, half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtz(v), i, (ushort *)p); +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtz(double16 v, size_t i, __global half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__global ushort *)p); +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstore_half16_rtz(double16 v, size_t i, __local half *p) +{ + vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__local ushort *)p); +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2(float2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2(float2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern 
__attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2(float2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rte(float2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rte(float2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rte(float2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtp(float2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtp(float2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtp(float2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 
__cvt_2f32_to_2f16_rtn(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtn(float2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtn(float2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtn(float2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtz(float2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtz(float2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtz(float2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v); + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3(float3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_cur(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) 
ushort3 __cvt_3f32_to_3f16_cur(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3(float3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_cur(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3(float3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_cur(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rte(float3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rte(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rte(float3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rte(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rte(float3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rte(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtp(float3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtp(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtp(float3 v, size_t i, 
__global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtp(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtp(float3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtp(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtn(float3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtn(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtn(float3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtn(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtn(float3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtn(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtz(float3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtz(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtz(float3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtz(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if 
__OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtz(float3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f32_to_3f16_rtz(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4(float4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4(float4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4(float4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rte(float4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rte(float4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rte(float4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v); + +} 
+#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtp(float4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtp(float4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtp(float4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtn(float4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtn(float4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtn(float4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtz(float4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern 
__attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtz(float4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtz(float4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8(float8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8(float8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8(float8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rte(float8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rte(float8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern 
__attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rte(float8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtp(float8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtp(float8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtp(float8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtn(float8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtn(float8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtn(float8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 
__cvt_8f32_to_8f16_rtz(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtz(float8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtz(float8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtz(float8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16(float16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16(float16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16(float16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rte(float16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) 
ushort16 __cvt_16f32_to_16f16_rte(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rte(float16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rte(float16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtp(float16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtp(float16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtp(float16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtn(float16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtn(float16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v); + +} 
+#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtn(float16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtz(float16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtz(float16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtz(float16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2(double2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2(double2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2(double2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) 
= __cvt_2f64_to_2f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rte(double2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rte(double2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rte(double2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtp(double2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtp(double2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtp(double2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtn(double2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v); + 
+} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtn(double2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtn(double2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtz(double2 v, size_t i, half *p) +{ + + *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtz(double2 v, size_t i, __global half *p) +{ + + *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half2_rtz(double2 v, size_t i, __local half *p) +{ + + *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v); + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3(double3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_cur(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3(double3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = 
__cvt_3f64_to_3f16_cur(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3(double3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_cur(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rte(double3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rte(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rte(double3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rte(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rte(double3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rte(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtp(double3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtp(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtp(double3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtp(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern 
__attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtp(double3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtp(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtn(double3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtn(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtn(double3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtn(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtn(double3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtn(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtz(double3 v, size_t i, half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtz(v); + *(ushort4 *)(p + i * 4) = h; + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half3_rtz(double3 v, size_t i, __global half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtz(v); + *(__global ushort4 *)(p + i * 4) = h; + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3); + +__attribute__((overloadable, always_inline, weak)) void 
+vstorea_half3_rtz(double3 v, size_t i, __local half *p) +{ + + ushort4 h; + h.s012 = __cvt_3f64_to_3f16_rtz(v); + *(__local ushort4 *)(p + i * 4) = h; + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4(double4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4(double4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4(double4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rte(double4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rte(double4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rte(double4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v); + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4); + +__attribute__((overloadable, always_inline, weak)) void 
+vstorea_half4_rtp(double4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtp(double4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtp(double4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtn(double4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtn(double4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtn(double4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtz(double4 v, size_t i, half *p) +{ + + *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4); + +__attribute__((overloadable, always_inline, weak)) void 
+vstorea_half4_rtz(double4 v, size_t i, __global half *p) +{ + + *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half4_rtz(double4 v, size_t i, __local half *p) +{ + + *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8(double8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8(double8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8(double8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rte(double8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rte(double8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8); + +__attribute__((overloadable, always_inline, weak)) 
void +vstorea_half8_rte(double8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtp(double8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtp(double8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtp(double8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtn(double8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtn(double8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtn(double8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8); + +__attribute__((overloadable, always_inline, weak)) void 
+vstorea_half8_rtz(double8 v, size_t i, half *p) +{ + + *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtz(double8 v, size_t i, __global half *p) +{ + + *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half8_rtz(double8 v, size_t i, __local half *p) +{ + + *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16(double16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16(double16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16(double16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rte(double16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16); + +__attribute__((overloadable, 
always_inline, weak)) void +vstorea_half16_rte(double16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rte(double16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtp(double16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtp(double16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtp(double16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtn(double16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtn(double16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern 
__attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtn(double16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v); + +} +#endif + + +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtz(double16 v, size_t i, half *p) +{ + + *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v); + +} + + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtz(double16 v, size_t i, __global half *p) +{ + + *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v); + +} +#endif + +#if __OPENCL_C_VERSION__ < 200 +extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16); + +__attribute__((overloadable, always_inline, weak)) void +vstorea_half16_rtz(double16 v, size_t i, __local half *p) +{ + + *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v); + +} +#endif
diff --git a/amd-builtins/workgroup/wg.h b/amd-builtins/workgroup/wg.h new file mode 100644 index 0000000..f3d969f --- /dev/null +++ b/amd-builtins/workgroup/wg.h
@@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// XXX The runtime computes CL_DEVICE_MAX_WORK_GROUP_SIZE as +// XXX dev->wave_front_size * dev->max_waves_per_simd +// XXX If max_waves_per_simd is ever raised then this code will need to be updated +#define MAX_WAVES_PER_SIMD 4 + +#pragma OPENCL EXTENSION cl_amd_program_scope_locals : enable +extern __local ulong __wg_scratch[MAX_WAVES_PER_SIMD]; +
diff --git a/amd-builtins/workgroup/wganyall.cl b/amd-builtins/workgroup/wganyall.cl new file mode 100644 index 0000000..2daa659 --- /dev/null +++ b/amd-builtins/workgroup/wganyall.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#if __OPENCL_C_VERSION__ >= 200

#include "wg.h"

/*
 * work_group_all / work_group_any, built on the sub-group primitives.
 * SUF is the builtin suffix (all/any); ID is the operation's identity
 * (1 for all, i.e. AND-like; 0 for any, i.e. OR-like), used to pad the
 * lanes of sub-group 0 that hold no partial result.
 *
 * Algorithm: reduce within each sub-group, lane 0 of every sub-group
 * parks its partial in __wg_scratch, sub-group 0 combines the partials
 * and publishes the answer, then every work-item reads it back.  The
 * final barrier keeps __wg_scratch safe for reuse by the next
 * work-group function.
 */
#define GEN_AA(SUF,ID) \
__attribute__((overloadable, always_inline)) int \
work_group_##SUF(int predicate) \
{ \
    uint n = get_num_sub_groups(); \
    int a = sub_group_##SUF(predicate); \
    if (n == 1) \
        return a; \
 \
    __local int *p = (__local int *)__wg_scratch; \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == 0) \
        p[i] = a; \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        a = l < n ? p[l] : ID; \
        a = sub_group_##SUF(a); \
        if (l == 0) \
            p[0] = a; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = p[0]; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
 \
    return a; \
}

/* No trailing ';' on these invocations: the macro already expands to a
 * complete function definition, and a stray file-scope semicolon is
 * invalid in strict OpenCL C (the 'any' line previously had one). */
GEN_AA(all, 1U)
GEN_AA(any, 0U)

#endif
diff --git a/amd-builtins/workgroup/wgbarrier.cl b/amd-builtins/workgroup/wgbarrier.cl new file mode 100644 index 0000000..a682e21 --- /dev/null +++ b/amd-builtins/workgroup/wgbarrier.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#if __OPENCL_C_VERSION__ >= 200

// HSAIL backend intrinsic: execution rendezvous for all work-items in the group.
extern void __hsail_barrier(void);

// Full form: make this work-item's prior writes visible (release fence),
// rendezvous at the hardware barrier, then observe the other work-items'
// writes (acquire fence), all at the requested memory scope.
__attribute__((overloadable, weak, always_inline)) void
work_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
{
    atomic_work_item_fence(flags, memory_order_release, scope);
    __hsail_barrier();
    atomic_work_item_fence(flags, memory_order_acquire, scope);
}

// Convenience form: the OpenCL 2.0 default scope is the work-group.
__attribute__((overloadable, weak, always_inline)) void
work_group_barrier(cl_mem_fence_flags flags)
{
    work_group_barrier(flags, memory_scope_work_group);
}

#endif
diff --git a/amd-builtins/workgroup/wgbcast.cl b/amd-builtins/workgroup/wgbcast.cl new file mode 100644 index 0000000..f279a1f --- /dev/null +++ b/amd-builtins/workgroup/wgbcast.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#if __OPENCL_C_VERSION__ >= 200

#include "wg.h"

/*
 * work_group_broadcast: the work-item selected by the local id(s)
 * publishes its value through a slot in __wg_scratch and every work-item
 * reads it back.  Barriers bracket the read so the slot can be reused
 * safely by the next work-group function.  The 1-D overload short-cuts
 * to sub_group_broadcast when the group is a single sub-group.
 */
#define GEN_BROADCAST(T) \
__attribute__((overloadable,weak,always_inline)) T \
work_group_broadcast(T a, size_t local_id_x) \
{ \
    if (get_num_sub_groups() == 1) \
        return sub_group_broadcast(a, local_id_x); \
 \
    __local T *slot = (__local T *)__wg_scratch; \
    if (get_local_id(0) == local_id_x) \
        *slot = a; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = *slot; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return a; \
} \
\
__attribute__((overloadable,weak,always_inline)) T \
work_group_broadcast(T a, size_t local_id_x, size_t local_id_y) \
{ \
    __local T *slot = (__local T *)__wg_scratch; \
    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y) \
        *slot = a; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = *slot; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return a; \
} \
\
__attribute__((overloadable,weak,always_inline)) T \
work_group_broadcast(T a, size_t local_id_x, size_t local_id_y, size_t local_id_z) \
{ \
    __local T *slot = (__local T *)__wg_scratch; \
    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y && get_local_id(2) == local_id_z) \
        *slot = a; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = *slot; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return a; \
}

GEN_BROADCAST(uint)
GEN_BROADCAST(int)
GEN_BROADCAST(ulong)
GEN_BROADCAST(long)
GEN_BROADCAST(float)
GEN_BROADCAST(double)

#endif
diff --git a/amd-builtins/workgroup/wgreduce.cl b/amd-builtins/workgroup/wgreduce.cl new file mode 100644 index 0000000..6ad4dec --- /dev/null +++ b/amd-builtins/workgroup/wgreduce.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#if __OPENCL_C_VERSION__ >= 200

#include "wg.h"

/*
 * work_group_reduce_<SUF> over TYPE, two-level algorithm:
 *   1. reduce within each sub-group;
 *   2. lane 0 of each sub-group parks its partial result in __wg_scratch;
 *   3. sub-group 0 reduces the partials (idle lanes padded with the
 *      identity ID) and publishes the result in slot 0;
 *   4. every work-item reads the published value.
 * The trailing barrier keeps __wg_scratch safe for reuse by a subsequent
 * work-group function.
 */
#define GENO(TYPE,SUF,ID) \
__attribute__((overloadable,weak,always_inline)) TYPE \
work_group_reduce_##SUF(TYPE a) \
{ \
    uint n = get_num_sub_groups(); \
    a = sub_group_reduce_##SUF(a); \
    if (n == 1) \
        return a; \
 \
    __local TYPE *p = (__local TYPE *)__wg_scratch; \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == 0) \
        p[i] = a; \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        a = l < n ? p[l] : ID; \
        a = sub_group_reduce_##SUF(a); \
        if (l == 0) \
            p[0] = a; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = p[0]; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return a; \
}

/* Addition is the general case with identity 0; the original GENA body
 * was a verbatim copy of GENO — express it that way instead. */
#define GENA(TYPE) GENO(TYPE,add,(TYPE)0)

GENA(int)
GENA(uint)
GENA(long)
GENA(ulong)
GENA(float)
GENA(double)

GENO(int,max,INT_MIN)
GENO(uint,max,0U)
GENO(long,max,LONG_MIN)
GENO(ulong,max,0UL)
GENO(float,max,-INFINITY)
GENO(double,max,-(double)INFINITY)

GENO(int,min,INT_MAX)
GENO(uint,min,UINT_MAX)
GENO(long,min,LONG_MAX)
GENO(ulong,min,ULONG_MAX)
GENO(float,min,INFINITY)
GENO(double,min,(double)INFINITY)

#endif
diff --git a/amd-builtins/workgroup/wgscan.cl b/amd-builtins/workgroup/wgscan.cl new file mode 100644 index 0000000..f3b4606 --- /dev/null +++ b/amd-builtins/workgroup/wgscan.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#if __OPENCL_C_VERSION__ >= 200

/* wg.h declares __wg_scratch via program-scope locals — an OpenCL 2.0
 * feature — so it must be included INSIDE the version guard.  It used to
 * sit above the guard (unlike every sibling file), which broke pre-2.0
 * compilation of this file. */
#include "wg.h"

/*
 * Work-group scans, two-level algorithm: scan within each sub-group,
 * have the last lane of each sub-group park that sub-group's total in
 * __wg_scratch, let sub-group 0 scan the totals into per-sub-group
 * prefixes, then combine each work-item's sub-group-local scan result
 * with the prefix of the preceding sub-groups.  Trailing barriers keep
 * __wg_scratch safe for reuse.
 */

/* Inclusive scan, addition (identity 0, combiner '+'). */
#define GENIA(TYPE) \
__attribute__((overloadable,weak,always_inline)) TYPE \
work_group_scan_inclusive_add(TYPE a) \
{ \
    uint n = get_num_sub_groups(); \
    a = sub_group_scan_inclusive_add(a); \
    if (n == 1) \
        return a; \
 \
    __local TYPE *p = (__local TYPE *)__wg_scratch; \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == get_sub_group_size() - 1U) \
        p[i] = a; \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        TYPE t = l < n ? p[l] : (TYPE)0; \
        t = sub_group_scan_inclusive_add(t); \
        if (l < n) \
            p[l] = t; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    TYPE ret = i == 0 ? a : a + p[i-1]; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return ret; \
}

/* Inclusive scan for min/max: SUF is both the builtin suffix and the
 * two-argument combiner builtin; ID is the identity element. */
#define GENIO(TYPE,SUF,ID) \
__attribute__((overloadable,weak,always_inline)) TYPE \
work_group_scan_inclusive_##SUF(TYPE a) \
{ \
    uint n = get_num_sub_groups(); \
    a = sub_group_scan_inclusive_##SUF(a); \
    if (n == 1) \
        return a; \
 \
    __local TYPE *p = (__local TYPE *)__wg_scratch; \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == get_sub_group_size() - 1U) \
        p[i] = a; \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        TYPE t = l < n ? p[l] : ID; \
        t = sub_group_scan_inclusive_##SUF(t); \
        if (l < n) \
            p[l] = t; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    TYPE ret = i == 0 ? a : SUF(a, p[i-1]); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return ret; \
}

GENIA(int)
GENIA(uint)
GENIA(long)
GENIA(ulong)
GENIA(float)
GENIA(double)

GENIO(int,max,INT_MIN)
GENIO(uint,max,0U)
GENIO(long,max,LONG_MIN)
GENIO(ulong,max,0UL)
GENIO(float,max,-INFINITY)
GENIO(double,max,-(double)INFINITY)

GENIO(int,min,INT_MAX)
GENIO(uint,min,UINT_MAX)
GENIO(long,min,LONG_MAX)
GENIO(ulong,min,ULONG_MAX)
GENIO(float,min,INFINITY)
GENIO(double,min,(double)INFINITY)

/* Exclusive scan, addition.  The last lane stores a + t — the sub-group's
 * inclusive total — since the exclusive scan result t omits its own a. */
#define GENEA(TYPE) \
__attribute__((overloadable,weak,always_inline)) TYPE \
work_group_scan_exclusive_add(TYPE a) \
{ \
    uint n = get_num_sub_groups(); \
    TYPE t = sub_group_scan_exclusive_add(a); \
    if (n == 1) \
        return t; \
 \
    __local TYPE *p = (__local TYPE *)__wg_scratch; \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == get_sub_group_size() - 1U) \
        p[i] = a + t; \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        TYPE s = l < n ? p[l] : (TYPE)0; \
        s = sub_group_scan_inclusive_add(s); \
        if (l < n) \
            p[l] = s; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    TYPE ret = i == 0 ? t : t + p[i-1]; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return ret; \
}

/* Exclusive scan for min/max; mirrors GENEA with SUF as the combiner. */
#define GENEO(TYPE,SUF,ID) \
__attribute__((overloadable,weak,always_inline)) TYPE \
work_group_scan_exclusive_##SUF(TYPE a) \
{ \
    uint n = get_num_sub_groups(); \
    TYPE t = sub_group_scan_exclusive_##SUF(a); \
    if (n == 1) \
        return t; \
 \
    __local TYPE *p = (__local TYPE *)__wg_scratch; \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == get_sub_group_size() - 1U) \
        p[i] = SUF(a, t); \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        TYPE s = l < n ? p[l] : ID; \
        s = sub_group_scan_inclusive_##SUF(s); \
        if (l < n) \
            p[l] = s; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    TYPE ret = i == 0 ? t : SUF(t, p[i-1]); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return ret; \
}

GENEA(int)
GENEA(uint)
GENEA(long)
GENEA(ulong)
GENEA(float)
GENEA(double)

GENEO(int,max,INT_MIN)
GENEO(uint,max,0U)
GENEO(long,max,LONG_MIN)
GENEO(ulong,max,0UL)
GENEO(float,max,-INFINITY)
GENEO(double,max,-(double)INFINITY)

GENEO(int,min,INT_MAX)
GENEO(uint,min,UINT_MAX)
GENEO(long,min,LONG_MAX)
GENEO(ulong,min,ULONG_MAX)
GENEO(float,min,INFINITY)
GENEO(double,min,(double)INFINITY)

#endif
diff --git a/amd-builtins/workgroup/wgscratch.cl b/amd-builtins/workgroup/wgscratch.cl new file mode 100644 index 0000000..e3db83e --- /dev/null +++ b/amd-builtins/workgroup/wgscratch.cl
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* BUGFIX: the guard previously read `#if __OPENCL_C_VERSION >= 200`
 * (missing trailing underscores).  That undefined identifier evaluates
 * to 0 in the preprocessor, so this definition was never compiled and
 * every work-group builtin referencing __wg_scratch failed to link. */
#if __OPENCL_C_VERSION__ >= 200

#include "wg.h"

// Temporary data for work group functions (declared extern in wg.h).
__local ulong __wg_scratch[MAX_WAVES_PER_SIMD];

#endif