Add AMD OpenCL builtins


git-svn-id: https://llvm.org/svn/llvm-project/libclc/branches/amd-builtins@219217 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/amd-builtins/README b/amd-builtins/README
new file mode 100644
index 0000000..630419a
--- /dev/null
+++ b/amd-builtins/README
@@ -0,0 +1,9 @@
+This directory contains builtins from AMD's OpenCL builtin library.
+There is an ongoing effort to port these builtins to libclc.  If you
+would like to port a function, review the libclc-dev@pcc.me.uk archives
+to make sure no one is already working on it.  If no one else has started
+this port, then send an email to libclc-dev@pcc.me.uk with the subject
+
+Porting: builtin_name
+
+This way we don't have multiple people trying to port the same functions.
diff --git a/amd-builtins/conv/conversions.cl b/amd-builtins/conv/conversions.cl
new file mode 100644
index 0000000..567ba70
--- /dev/null
+++ b/amd-builtins/conv/conversions.cl
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#define ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,Rnd)  \
+extern __attribute__((pure)) \
+ToTy __cvt_##ToSuf##_##Rnd##_##FromSuf(FromTy); 
+
+#define ConvIntrinPrototypeSet(FromTy,FromSuf,ToTy,ToSuf)  \
+  ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rte)  \
+  ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rtn)  \
+  ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rtp)  \
+  ConvIntrinPrototype(FromTy,FromSuf,ToTy,ToSuf,rtz)
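+
+// For reference, ConvIntrinPrototypeSet(float,f32,int,s32) expands to one
+// prototype per rounding mode:
+//   extern __attribute__((pure)) int __cvt_s32_rte_f32(float);
+//   extern __attribute__((pure)) int __cvt_s32_rtn_f32(float);
+//   extern __attribute__((pure)) int __cvt_s32_rtp_f32(float);
+//   extern __attribute__((pure)) int __cvt_s32_rtz_f32(float);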
+
+#define FloatToIntegerRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd)  \
+__attribute__((always_inline)) ToTy \
+__convert_##ToTy##_##Rnd##_##FromSuf(FromTy x) { \
+  return (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x);  \
+}
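+
+// For reference, FloatToIntegerRoundingConv(float,f32,int,s32,rte) expands to
+//   __attribute__((always_inline)) int
+//   __convert_int_rte_f32(float x) {
+//     return (int)__cvt_s32_rte_f32(x);
+//   }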
+
+#define FloatToIntegerSatRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd,Min,Max)  \
+__attribute__((always_inline)) ToTy \
+__convert_##ToTy##_sat_##Rnd##_##FromSuf(FromTy x) { \
+  ToTy r; \
+  if (sizeof(ToTy) >= sizeof(long)) { \
+    r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x);  \
+    bool le = (x <= (FromTy)Min);  \
+    bool ge = (x >= (FromTy)Max);  \
+    r = le?((ToTy) Min):r; \
+    r = ge?((ToTy) Max):r; \
+  } else { \
+    FromTy s = min(max(x, (FromTy)Min), (FromTy)Max); \
+    r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(s);  \
+  } \
+  return r; \
+}
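+
+// Worked example: for char, sizeof(char) < sizeof(long), so the instantiation
+// FloatToIntegerSatRoundingConv(float,f32,char,s32,rte,CHAR_MIN,CHAR_MAX)
+// takes the else branch: __convert_char_sat_rte_f32(300.0f) clamps 300.0f to
+// 127.0f before converting, and returns CHAR_MAX.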
+
+#define FloatToUnsignedSatRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd,Max)  \
+__attribute__((always_inline)) ToTy \
+__convert_##ToTy##_sat_##Rnd##_##FromSuf(FromTy x) { \
+  ToTy r; \
+  if (sizeof(ToTy) >= sizeof(long)) { \
+    r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x);  \
+    bool le = (x <= (FromTy)0);  \
+    bool ge = (x >= (FromTy)Max);  \
+    r = le?((ToTy) 0):r; \
+    r = ge?((ToTy) Max):r; \
+  } else { \
+    FromTy s = min(max(x, (FromTy)0), (FromTy)Max); \
+    r = (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(s);  \
+  } \
+  return r; \
+}
+
+#define AllFloatToIntegerRoundingConv(Ty,TySuf)  \
+  FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rte)  \
+  FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rtn)  \
+  FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rtp)  \
+  FloatToIntegerRoundingConv(float,f32,Ty,TySuf,rtz)
+
+#define AllFloatToIntegerSatRoundingConv(Ty,TySuf,Min,Max)  \
+  FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rte,Min,Max)  \
+  FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rtn,Min,Max)  \
+  FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rtp,Min,Max)  \
+  FloatToIntegerSatRoundingConv(float,f32,Ty,TySuf,rtz,Min,Max)
+
+#define AllFloatToUnsignedSatRoundingConv(Ty,TySuf,Max)  \
+  FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rte,Max)  \
+  FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rtn,Max)  \
+  FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rtp,Max)  \
+  FloatToUnsignedSatRoundingConv(float,f32,Ty,TySuf,rtz,Max)  
+
+#define AllDoubleToIntegerRoundingConv(Ty,TySuf)  \
+  FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rte)  \
+  FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rtn)  \
+  FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rtp)  \
+  FloatToIntegerRoundingConv(double,f64,Ty,TySuf,rtz)
+
+#define AllDoubleToIntegerSatRoundingConv(Ty,TySuf,Min,Max)  \
+  FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rte,Min,Max)  \
+  FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rtn,Min,Max)  \
+  FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rtp,Min,Max)  \
+  FloatToIntegerSatRoundingConv(double,f64,Ty,TySuf,rtz,Min,Max)
+
+#define AllDoubleToUnsignedSatRoundingConv(Ty,TySuf,Max)  \
+  FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rte,Max)  \
+  FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rtn,Max)  \
+  FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rtp,Max)  \
+  FloatToUnsignedSatRoundingConv(double,f64,Ty,TySuf,rtz,Max)  
+
+#define FloatingPointRoundingConv(FromTy,FromSuf,ToTy,ToSuf,Rnd)  \
+__attribute__((always_inline)) ToTy \
+__convert_##ToTy##_##Rnd##_##FromSuf(FromTy x) { \
+  return (ToTy)__cvt_##ToSuf##_##Rnd##_##FromSuf(x);  \
+}
+
+#define AllIntegerToFloatRoundingConv(Ty,TySuf)  \
+  FloatingPointRoundingConv(Ty,TySuf,float,f32,rtn)  \
+  FloatingPointRoundingConv(Ty,TySuf,float,f32,rtp)  \
+  FloatingPointRoundingConv(Ty,TySuf,float,f32,rtz)
+
+#define AllIntegerToDoubleRoundingConv(Ty,TySuf)  \
+  FloatingPointRoundingConv(Ty,TySuf,double,f64,rtn)  \
+  FloatingPointRoundingConv(Ty,TySuf,double,f64,rtp)  \
+  FloatingPointRoundingConv(Ty,TySuf,double,f64,rtz)
+
+// Prototypes for conversion intrinsics
+
+// float to integer conversion intrinsics
+ConvIntrinPrototypeSet(float,f32,int,s32)
+ConvIntrinPrototypeSet(float,f32,uint,u32)
+ConvIntrinPrototypeSet(float,f32,long,s64)
+ConvIntrinPrototypeSet(float,f32,ulong,u64)
+
+// double to integer conversion intrinsics
+ConvIntrinPrototypeSet(double,f64,int,s32)
+ConvIntrinPrototypeSet(double,f64,uint,u32)
+ConvIntrinPrototypeSet(double,f64,long,s64)
+ConvIntrinPrototypeSet(double,f64,ulong,u64)
+
+// integer to float conversion intrinsics
+ConvIntrinPrototypeSet(int,i32,float,f32)
+ConvIntrinPrototypeSet(uint,u32,float,f32)
+ConvIntrinPrototypeSet(long,i64,float,f32)
+ConvIntrinPrototypeSet(ulong,u64,float,f32)
+
+// long to double conversion intrinsics
+ConvIntrinPrototypeSet(long,i64,double,f64)
+ConvIntrinPrototypeSet(ulong,u64,double,f64)
+
+// double to float conversion intrinsics
+ConvIntrinPrototypeSet(double,f64,float,f32)
+
+// Definitions for conversion functions
+
+// float to integer conversions
+AllFloatToIntegerRoundingConv(char,s32)
+AllFloatToIntegerRoundingConv(short,s32)
+AllFloatToIntegerRoundingConv(int,s32)
+AllFloatToIntegerRoundingConv(long,s64)
+
+AllFloatToIntegerRoundingConv(uchar,u32)
+AllFloatToIntegerRoundingConv(ushort,u32)
+AllFloatToIntegerRoundingConv(uint,u32)
+AllFloatToIntegerRoundingConv(ulong,u64)
+
+AllFloatToIntegerSatRoundingConv(char,s32,CHAR_MIN,CHAR_MAX)
+AllFloatToIntegerSatRoundingConv(short,s32,SHRT_MIN,SHRT_MAX)
+AllFloatToIntegerSatRoundingConv(int,s32,INT_MIN,INT_MAX)
+AllFloatToIntegerSatRoundingConv(long,s64,LONG_MIN,LONG_MAX)
+
+AllFloatToUnsignedSatRoundingConv(uchar,u32,UCHAR_MAX)
+AllFloatToUnsignedSatRoundingConv(ushort,u32,USHRT_MAX)
+AllFloatToUnsignedSatRoundingConv(uint,u32,UINT_MAX)
+AllFloatToUnsignedSatRoundingConv(ulong,u64,ULONG_MAX)
+
+// double to integer conversions
+AllDoubleToIntegerRoundingConv(char,s32)
+AllDoubleToIntegerRoundingConv(short,s32)
+AllDoubleToIntegerRoundingConv(int,s32)
+AllDoubleToIntegerRoundingConv(long,s64)
+
+AllDoubleToIntegerRoundingConv(uchar,u32)
+AllDoubleToIntegerRoundingConv(ushort,u32)
+AllDoubleToIntegerRoundingConv(uint,u32)
+AllDoubleToIntegerRoundingConv(ulong,u64)
+
+AllDoubleToIntegerSatRoundingConv(char,s32,CHAR_MIN,CHAR_MAX)
+AllDoubleToIntegerSatRoundingConv(short,s32,SHRT_MIN,SHRT_MAX)
+AllDoubleToIntegerSatRoundingConv(int,s32,INT_MIN,INT_MAX)
+AllDoubleToIntegerSatRoundingConv(long,s64,LONG_MIN,LONG_MAX)
+
+AllDoubleToUnsignedSatRoundingConv(uchar,u32,UCHAR_MAX)
+AllDoubleToUnsignedSatRoundingConv(ushort,u32,USHRT_MAX)
+AllDoubleToUnsignedSatRoundingConv(uint,u32,UINT_MAX)
+AllDoubleToUnsignedSatRoundingConv(ulong,u64,ULONG_MAX)
+
+// integer to float
+AllIntegerToFloatRoundingConv(int,i32)
+AllIntegerToFloatRoundingConv(uint,u32)
+AllIntegerToFloatRoundingConv(long,i64)
+AllIntegerToFloatRoundingConv(ulong,u64)
+
+// long/ulong to double
+AllIntegerToDoubleRoundingConv(long,i64)
+AllIntegerToDoubleRoundingConv(ulong,u64)
+
+// double to float conversions
+FloatingPointRoundingConv(double,f64,float,f32,rtn)
+FloatingPointRoundingConv(double,f64,float,f32,rtp)
+FloatingPointRoundingConv(double,f64,float,f32,rtz)
diff --git a/amd-builtins/devenq/devenq.h b/amd-builtins/devenq/devenq.h
new file mode 100644
index 0000000..7232805
--- /dev/null
+++ b/amd-builtins/devenq/devenq.h
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+#define CLK_ENQUEUE_FAILURE -1
+
+// XXX This was copied from runtime/device/gpu/gpuschedcl.cpp
+
+//! AmdAqlWrap slot state
+enum AqlWrapState {
+    AQL_WRAP_FREE = 0,
+    AQL_WRAP_RESERVED,
+    AQL_WRAP_READY,
+    AQL_WRAP_MARKER,
+    AQL_WRAP_BUSY,
+    AQL_WRAP_DONE
+};
+
+//! Profiling states
+enum ProfilingState {
+    PROFILING_COMMAND_START = 0,
+    PROFILING_COMMAND_END,
+    PROFILING_COMMAND_COMPLETE
+};
+
+//! OCL dispatch condition flags
+// --- this is unused in the library and I've asked German to remove
+//     it in favor of the clang enum
+enum ClFlags {
+    NO_WAIT = 0,
+    WAIT_PARENT,
+    WAIT_WORK_GROUP
+};
+
+typedef struct _HsaAqlDispatchPacket {
+    uint    mix;
+    ushort  workgroup_size[3];
+    ushort  reserved2;
+    uint    grid_size[3];
+    uint    private_segment_size_bytes;
+    uint    group_segment_size_bytes;
+    ulong   kernel_object_address;
+    ulong   kernel_arg_address;
+    ulong   reserved3;
+    ulong   completion_signal;
+} HsaAqlDispatchPacket;
+
+typedef struct _AmdVQueueHeader {
+    uint    aql_slot_num;       //!< [LRO/SRO] The total number of AQL slots (multiple of 64).
+    uint    event_slot_num;     //!< [LRO] The number of kernel events in the events buffer
+    ulong   event_slot_mask;    //!< [LRO] A pointer to the allocation bitmask array for the events
+    ulong   event_slots;        //!< [LRO] Pointer to a buffer for the events.
+                                // Array of event_slot_num entries of AmdEvent
+    ulong   aql_slot_mask;      //!< [LRO/SRO] A pointer to the allocation bitmask for AqlWrap slots
+    uint    command_counter;    //!< [LRW] The global counter for the submitted commands into the queue
+    uint    wait_size;          //!< [LRO] The wait list size (in clk_event_t)
+    uint    arg_size;           //!< [LRO] The size of argument buffer (in bytes)
+    uint    reserved0;          //!< For the future usage
+    ulong   kernel_table;       //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
+    uint    reserved[2];        //!< For the future usage
+} AmdVQueueHeader;
+
+typedef struct _AmdAqlWrap {
+    uint state;             //!< [LRW/SRW] The current state of the AQL wrapper:  FREE, RESERVED, READY,
+                            // MARKER, BUSY and DONE. The block could be returned back to a free state.
+    uint enqueue_flags;     //!< [LWO/SRO] Contains the flags for the kernel execution start -
+                            //  (KERNEL_ENQUEUE_FLAGS_T)
+                            // NO_WAIT - we just start processing
+                            // WAIT_PARENT - check if parent_wrap->state is done and then start processing
+                            // WAIT_WORK_GROUP currently == WAIT_PARENT
+    uint command_id;        //!< [LWO/SRO] The unique command ID
+    uint child_counter;     //!< [LRW/SRW] Counter that tracks the launches of child kernels.
+                            // It's incremented on start and decremented on finish.
+                            // The parent kernel can be considered done when the
+                            // value is 0 and the state is DONE
+    ulong completion;       //!< [LWO/SRO] CL event for the current execution (clk_event_t)
+    ulong parent_wrap;      //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
+    ulong wait_list;        //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
+    uint wait_num;          //!< [LWO/SRO] The number of events in wait_list
+    uint reserved[5];       //!< For the future usage
+    HsaAqlDispatchPacket aql;  //!< [LWO/SRO] AQL packet - 64 bytes AQL packet
+} AmdAqlWrap;
+
+typedef struct _AmdEvent {
+    uint state;             //!< [LRO/SRW] Event state: START, END, COMPLETE
+    uint counter;           //!< [LRW] Event retain/release counter. 0 means the event is free
+    ulong timer[3];         //!< [LRO/SWO] Timer values for profiling for each state
+} AmdEvent;
+
+// XXX This is adapted from hsa.h
+
+// This is an OpenCLized hsa_control_directives_t
+typedef struct _HsaControlDirectives {
+  ulong enabled_control_directives;
+  ushort enable_break_exceptions;
+  ushort enable_detect_exceptions;
+  uint max_dynamic_group_size;
+  uint max_flat_grid_size;
+  uint max_flat_workgroup_size;
+  uint requested_workgroups_per_cu;
+  uint required_grid_size[3];
+  uint required_workgroup_size[3];
+  uchar required_dim;
+  uchar reserved[75];
+} HsaControlDirectives;
+
+// This is an OpenCLized amd_kernel_code_t
+typedef struct _AmdKernelCode {
+  uint amd_code_version_major;
+  uint amd_code_version_minor;
+  uint struct_byte_size;
+  uint target_chip;
+  ulong kernel_code_entry_byte_offset;
+  ulong kernel_code_prefetch_byte_offset;
+  ulong kernel_code_prefetch_byte_size;
+  ulong max_scratch_backing_memory_byte_size;
+  ulong compute_pgm_resource_registers;
+  uint enables_and_flags;
+  uint gds_segment_byte_size;
+  ushort debug_wavefront_private_segment_offset_sgpr;
+  ushort debug_private_segment_buffer_sgpr;
+  ushort wavefront_sgpr_count;
+  ushort workitem_vgpr_count;
+  ulong kernarg_segment_byte_size;
+  uint workitem_private_segment_byte_size;
+  uint workgroup_group_segment_byte_size;
+  uint workgroup_fbarrier_count;
+  uchar kernarg_segment_alignment;
+  uchar group_segment_alignment;
+  uchar private_segment_alignment;
+  uchar code_alignment;
+  uint code_type;
+  uint code_properties;
+  uchar wavefront_size;
+  uchar optimization_level;
+  uchar hsail_profile;
+  uchar hsail_machine_model;
+  uint hsail_version_major;
+  uint hsail_version_minor;
+  ushort hsail_target_options;
+  ushort reserved3;
+  HsaControlDirectives control_directive;
+} AmdKernelCode;
+
+
+// Library only from here
+
+// XXX this needs to match workgroup/wg.h MAX_WAVES_PER_SIMD
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE 256
+
+// ABI has 6 special leading arguments:
+//  global_offset[3], printf_buf, default vqueue pointer, and self AqlWrap pointer
+#define NUM_SPECIAL_ARGS 6
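+//
+// The kernarg slot layout implied by the argument setup in enqueue.cl:
+//   args[0..2] = global_offset[0..2]
+//   args[3]    = printf buffer pointer
+//   args[4]    = default vqueue pointer
+//   args[5]    = self AqlWrap pointer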
+extern __attribute__((const)) uint  __hsail_ld_kernarg_u32(uint);
+extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint);
+
+static inline __global AmdVQueueHeader *
+get_vqueue(void)
+{
+    size_t vq;
+
+    if (sizeof(size_t) == 4)
+	vq = __hsail_ld_kernarg_u32(4*4);
+    else
+	vq = __hsail_ld_kernarg_u64(4*8);
+
+    return (__global AmdVQueueHeader *)vq;
+}
+
+static inline __global AmdAqlWrap *
+get_aql_wrap(void)
+{
+    size_t aw;
+
+    if (sizeof(size_t) == 4)
+	aw = __hsail_ld_kernarg_u32(5*4);
+    else
+	aw = __hsail_ld_kernarg_u64(5*8);
+
+    return (__global AmdAqlWrap *)aw;
+}
+
+static inline __global void *
+get_printf_ptr(void)
+{
+    size_t pb;
+
+    if (sizeof(size_t) == 4)
+        pb = __hsail_ld_kernarg_u32(3*4);
+    else
+        pb = __hsail_ld_kernarg_u64(3*8);
+
+    return (__global void *)pb;
+}
+
+typedef struct _NdRange {
+    uint dim;
+    size_t goff[3];
+    size_t gws[3];
+    size_t lws[3];
+} NdRange;
+
+// reserve a slot in a bitmask controlled resource
+// n is the number of slots
+static inline int
+reserve_slot(__global uint * restrict mask, uint n)
+{
+    n >>= 5;
+    uint i, j, k, v, vv, z;
+
+    /* Spread the starting points */
+    i = get_sub_group_local_id() % n;
+
+    /* Allow only one pass */
+    for (j=0,k=i;j<n;++j) {
+        __global atomic_uint *p = (__global atomic_uint *)(mask + k);
+        v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device);
+        for (;;) {
+            z = ctz(~v);
+            if (z == 32U)
+                break;
+            vv = v | (1U << z);
+            if (atomic_compare_exchange_strong_explicit(p, &v, vv, memory_order_acq_rel, memory_order_acquire, memory_scope_device))
+                break;
+        }
+        if (z < 32U)
+            break;
+        k = k == n-1 ? 0 : k+1;
+    }
+
+    k = (k << 5) + z;
+    return z < 32U ? (int)k : -1;
+}
+
+// release slot in a bitmask controlled resource
+// i is the slot number
+static inline void
+release_slot(__global uint * restrict mask, uint i)
+{
+    /* uint b = ~(1UL << (i & 0x1f)); */
+    uint b = ~amd_bfm(1U, i);
+    __global atomic_uint *p = (__global atomic_uint *)(mask + (i >> 5));
+    uint v, vv;
+
+    v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device);
+    for (;;) {
+        vv = v & b;
+        if (atomic_compare_exchange_strong_explicit(p, &v, vv, memory_order_acq_rel, memory_order_acquire, memory_scope_device))
+            break;
+    }
+}
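+
+// Typical usage (cf. enqueue_marker in enqueue.cl): reserve a slot up front
+// and release it again if a later step fails:
+//
+//   __global uint *amask = (__global uint *)vq->aql_slot_mask;
+//   int ai = reserve_slot(amask, vq->aql_slot_num);
+//   if (ai < 0)
+//       return CLK_ENQUEUE_FAILURE;
+//   ...
+//   release_slot(amask, ai);  // only if a subsequent step fails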
+
+static inline uint
+align_up(uint start, uint align)
+{
+    return (start + align - 1U) & -align;
+}
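+
+// For example, align_up(13, 8) == 16.  align must be a power of two for the
+// (start + align - 1) & -align trick to be valid.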
+
diff --git a/amd-builtins/devenq/enqueue.cl b/amd-builtins/devenq/enqueue.cl
new file mode 100644
index 0000000..c944874
--- /dev/null
+++ b/amd-builtins/devenq/enqueue.cl
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "devenq.h"
+
+static inline void
+copy_waitlist(__global AmdEvent **dst, __global AmdEvent **src, uint n)
+{
+    uint i;
+    for (i=0; i<n; ++i)
+        dst[i] = src[i];
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) queue_t
+get_default_queue(void)
+{
+    return (queue_t)get_vqueue();
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+enqueue_marker(queue_t q, uint nwl, const clk_event_t *wl, clk_event_t *re)
+{
+    __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q;
+    if (nwl > vq->wait_size)
+        return CLK_ENQUEUE_FAILURE;
+
+    // Get a wrap slot
+    __global uint *amask = (__global uint *)vq->aql_slot_mask;
+    int ai = reserve_slot(amask, vq->aql_slot_num);
+    if (ai < 0)
+        return CLK_ENQUEUE_FAILURE;
+
+    // Get a return event slot
+    __global uint *emask = (__global uint *)vq->event_slot_mask;
+    int ei = reserve_slot(emask, vq->event_slot_num);
+    if (ei < 0) {
+	release_slot(amask, ai);
+        return CLK_ENQUEUE_FAILURE;
+    }
+
+    // Initialize return event
+    __global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + ei;
+    ev->state = CL_SUBMITTED;
+    ev->counter = 1;
+    ev->timer[0] = 0;
+    ev->timer[1] = 0;
+    ev->timer[2] = 0;
+
+    // Initialize wrap
+    __global AmdAqlWrap *me = get_aql_wrap();
+    __global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai;
+
+    aw->enqueue_flags = CLK_ENQUEUE_FLAGS_NO_WAIT;
+    aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device);
+    aw->child_counter = 0;
+    aw->completion = (ulong)ev;
+    aw->parent_wrap = (ulong)me;
+
+    if (nwl > 0)
+        copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl);
+
+    aw->wait_num = nwl;
+
+    // A marker is never dispatched, so the dispatch packet is ignored
+
+    // Tell the scheduler
+    atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device);
+    atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_MARKER, memory_order_release, memory_scope_device);
+
+    *re = (clk_event_t)ev;
+    return 0;
+}
+
+// int
+// __enqueue_internal_{0,1,.,10}[_events] (
+//   queue_t q,
+//   int flags,
+//   ndrange_t ndr,
+//   [uint nwl, clk_event_t *wl, clk_event_t *re,   -- _events variants only]
+//   __global void * something_like_function_pointer,
+//   __global void * wrap_ptr_from_prep
+//   [, uint size0, uint align0
+//    [, uint size1, uint align1
+//     [, uint size2, uint align2
+//      [, uint size3, uint align3
+//       ...]]]]]] );
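+//
+// A hypothetical call sequence (a sketch only; the actual lowering is emitted
+// by the compiler, and the names ctx_size/ctx_align/ctx_ptr/ndr/fp here are
+// placeholders): __enqueue_prep_N stages the captured context and returns the
+// wrap pointer, which is then handed to __enqueue_internal_N together with one
+// size/align pair per argument that SET_KARG below turns into a group-segment
+// offset:
+//
+//   __global void *wrap;
+//   if (__enqueue_prep_1(q, ctx_size, ctx_align, ctx_ptr, &wrap) != CLK_SUCCESS)
+//       return CLK_ENQUEUE_FAILURE;
+//   return __enqueue_internal_1(q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndr, fp, wrap, sz0, al0);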
+
+// Help with size and alignment handling
+#define _SA_ARGS10 _SA_ARGS9, uint sz9, uint al9
+#define _SA_ARGS9  _SA_ARGS8, uint sz8, uint al8
+#define _SA_ARGS8  _SA_ARGS7, uint sz7, uint al7
+#define _SA_ARGS7  _SA_ARGS6, uint sz6, uint al6
+#define _SA_ARGS6  _SA_ARGS5, uint sz5, uint al5
+#define _SA_ARGS5  _SA_ARGS4, uint sz4, uint al4
+#define _SA_ARGS4  _SA_ARGS3, uint sz3, uint al3
+#define _SA_ARGS3  _SA_ARGS2, uint sz2, uint al2
+#define _SA_ARGS2  _SA_ARGS1, uint sz1, uint al1
+#define _SA_ARGS1  _SA_ARGS0, uint sz0, uint al0
+#define _SA_ARGS0
+
+#define SA_ARGS(N) _SA_ARGS##N
+
+#define _SET_KARG10 _SET_KARG9; lo = align_up(lo, al9); args[6+9] = lo; lo += sz9
+#define _SET_KARG9  _SET_KARG8; lo = align_up(lo, al8); args[6+8] = lo; lo += sz8
+#define _SET_KARG8  _SET_KARG7; lo = align_up(lo, al7); args[6+7] = lo; lo += sz7
+#define _SET_KARG7  _SET_KARG6; lo = align_up(lo, al6); args[6+6] = lo; lo += sz6
+#define _SET_KARG6  _SET_KARG5; lo = align_up(lo, al5); args[6+5] = lo; lo += sz5
+#define _SET_KARG5  _SET_KARG4; lo = align_up(lo, al4); args[6+4] = lo; lo += sz4
+#define _SET_KARG4  _SET_KARG3; lo = align_up(lo, al3); args[6+3] = lo; lo += sz3
+#define _SET_KARG3  _SET_KARG2; lo = align_up(lo, al2); args[6+2] = lo; lo += sz2
+#define _SET_KARG2  _SET_KARG1; lo = align_up(lo, al1); args[6+1] = lo; lo += sz1
+#define _SET_KARG1              lo = align_up(lo, al0); args[6+0] = lo; lo += sz0
+#define _SET_KARG0
+
+#define SET_KARG(N) _SET_KARG##N
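+
+// For reference, SA_ARGS(2) appends ", uint sz0, uint al0, uint sz1, uint al1"
+// to the parameter list, and SET_KARG(2) expands to
+//   lo = align_up(lo, al0); args[6+0] = lo; lo += sz0;
+//   lo = align_up(lo, al1); args[6+1] = lo; lo += sz1;
+// i.e. each size/align pair is allocated space in the group segment and the
+// resulting offset is written into the corresponding kernel argument slot.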
+
+#define GEN(N) \
+__attribute__((always_inline)) \
+int \
+__enqueue_internal_##N(queue_t q, int flags, ndrange_t ndr_type, \
+	               __global void *fp, __global void *aqlWrap SA_ARGS(N)) \
+{ \
+    __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \
+    __global AmdAqlWrap *me = get_aql_wrap(); \
+    __global uint *amask = (__global uint *)vq->aql_slot_mask; \
+    __global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \
+    int ai = aw - (__global AmdAqlWrap *)(vq + 1); \
+    __private NdRange *ndr = (__private NdRange *) &ndr_type; \
+ \
+    /* Skip check of dim for now */ \
+    if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \
+	CL_DEVICE_MAX_WORK_GROUP_SIZE) { \
+	release_slot(amask, ai); \
+	return CLK_ENQUEUE_FAILURE; \
+    } \
+ \
+    /* This is the current index-based approach, not the ldk based approach */ \
+    __global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \
+    uint ki = (uint)fp; \
+    __global AmdKernelCode *kc = kt[ki]; \
+ \
+    aw->enqueue_flags = flags; \
+ \
+    aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
+    aw->child_counter = 0; \
+    aw->completion = 0; \
+    aw->parent_wrap = (ulong)me; \
+    aw->wait_num = 0; \
+ \
+    aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0); \
+    aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \
+    aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \
+    aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \
+    aw->aql.grid_size[0] = (uint)ndr->gws[0]; \
+    aw->aql.grid_size[1] = (uint)ndr->gws[1]; \
+    aw->aql.grid_size[2] = (uint)ndr->gws[2]; \
+    aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \
+    aw->aql.group_segment_size_bytes = 0; \
+    aw->aql.kernel_object_address = (ulong)kc; \
+    aw->aql.completion_signal = 0; \
+ \
+    /* Set non-capture arguments */ \
+    __global size_t *args = (__global size_t *)aw->aql.kernel_arg_address; \
+    args[0] = ndr->goff[0]; \
+    args[1] = ndr->goff[1]; \
+    args[2] = ndr->goff[2]; \
+    args[3] = (size_t)get_printf_ptr(); \
+    args[4] = (size_t)vq; \
+    args[5] = (size_t)aw; \
+ \
+    uint lo0 = kc->workgroup_group_segment_byte_size; \
+    uint lo = lo0; \
+    SET_KARG(N); \
+    aw->aql.group_segment_size_bytes = lo - lo0; \
+ \
+    /* Tell the scheduler */ \
+    atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
+    atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); \
+    return 0; \
+}
+
+GEN(0)
+GEN(1)
+GEN(2)
+GEN(3)
+GEN(4)
+GEN(5)
+GEN(6)
+GEN(7)
+GEN(8)
+GEN(9)
+GEN(10)
+
+// Now the versions with events
+
+#define EGEN(N) \
+__attribute__((always_inline)) \
+int \
+__enqueue_internal_##N##_events(queue_t q, int flags, ndrange_t ndr_type, \
+	                        uint nwl, clk_event_t *wl, clk_event_t *re, \
+	                        __global void *fp, __global void *aqlWrap SA_ARGS(N)) \
+{ \
+    __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q; \
+    __global uint *amask = (__global uint *)vq->aql_slot_mask; \
+    __global AmdAqlWrap *aw = (__global AmdAqlWrap *) aqlWrap; \
+    int ai = aw - (__global AmdAqlWrap *)(vq + 1); \
+     __private NdRange *ndr = (__private NdRange *) &ndr_type; \
+ \
+    /* Skip check of dim for now */ \
+    if (mul24(mul24((uint)ndr->lws[0], (uint)ndr->lws[1]), (uint)ndr->lws[2]) > \
+        CL_DEVICE_MAX_WORK_GROUP_SIZE || nwl > vq->wait_size) { \
+        release_slot(amask, ai); \
+        return CLK_ENQUEUE_FAILURE; \
+    } \
+ \
+    __global AmdAqlWrap *me = get_aql_wrap(); \
+    __global AmdEvent *ev = NULL; \
+ \
+    if (re != NULL) { \
+        /* Get a return event slot */ \
+        __global uint *emask = (__global uint *)vq->event_slot_mask; \
+        int ei = reserve_slot(emask, vq->event_slot_num); \
+        if (ei < 0) { \
+            release_slot(amask, ai); \
+            return CLK_ENQUEUE_FAILURE; \
+	} \
+ \
+        /* Initialize return event */ \
+        ev = (__global AmdEvent *)vq->event_slots + ei; \
+        ev->state = CL_SUBMITTED; \
+        ev->counter = 1; \
+        ev->timer[0] = 0; \
+        ev->timer[1] = 0; \
+        ev->timer[2] = 0; \
+    } \
+ \
+    /* This is the current index-based approach, not the ldk based approach */ \
+    __global AmdKernelCode **kt = (__global AmdKernelCode **)vq->kernel_table; \
+    uint ki = (uint)fp; \
+    __global AmdKernelCode *kc = kt[ki]; \
+ \
+    aw->enqueue_flags = flags; \
+ \
+    aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
+    aw->child_counter = 0; \
+    aw->completion = 0; \
+    aw->parent_wrap = (ulong)me; \
+ \
+    aw->aql.mix = ((uint)ndr->dim << 16) | (0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0); \
+    aw->aql.workgroup_size[0] = (ushort)ndr->lws[0]; \
+    aw->aql.workgroup_size[1] = (ushort)ndr->lws[1]; \
+    aw->aql.workgroup_size[2] = (ushort)ndr->lws[2]; \
+    aw->aql.grid_size[0] = (uint)ndr->gws[0]; \
+    aw->aql.grid_size[1] = (uint)ndr->gws[1]; \
+    aw->aql.grid_size[2] = (uint)ndr->gws[2]; \
+    aw->aql.private_segment_size_bytes = kc->workitem_private_segment_byte_size; \
+    aw->aql.group_segment_size_bytes = 0; \
+    aw->aql.kernel_object_address = (ulong)kc; \
+    aw->aql.completion_signal = 0; \
+ \
+    /* Set non-capture arguments */ \
+    __global size_t *args = (__global size_t *)aw->aql.kernel_arg_address; \
+    args[0] = ndr->goff[0]; \
+    args[1] = ndr->goff[1]; \
+    args[2] = ndr->goff[2]; \
+    args[3] = (size_t)get_printf_ptr(); \
+    args[4] = (size_t)vq; \
+    args[5] = (size_t)aw; \
+ \
+    uint lo0 = kc->workgroup_group_segment_byte_size; \
+    uint lo = lo0; \
+    SET_KARG(N); \
+    aw->aql.group_segment_size_bytes = lo - lo0; \
+ \
+    /* Copy wait list */ \
+    if (nwl > 0) \
+        copy_waitlist((__global AmdEvent **)aw->wait_list, (__global AmdEvent **)wl, nwl); \
+ \
+    aw->wait_num = nwl; \
+ \
+    /* Tell the scheduler */ \
+    atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_acq_rel, memory_scope_device); \
+    atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); \
+ \
+    if (re != NULL) \
+        *re = (clk_event_t)ev; \
+ \
+    return 0; \
+}
+
+EGEN(0)
+EGEN(1)
+EGEN(2)
+EGEN(3)
+EGEN(4)
+EGEN(5)
+EGEN(6)
+EGEN(7)
+EGEN(8)
+EGEN(9)
+EGEN(10)
+
+#endif
+
diff --git a/amd-builtins/devenq/eprep.cl b/amd-builtins/devenq/eprep.cl
new file mode 100644
index 0000000..93a771d
--- /dev/null
+++ b/amd-builtins/devenq/eprep.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "devenq.h"
+
+static inline void
+copy_captured_context(__global void *d, __private void *s, uint size, uint align)
+{
+    if (align == 2) {
+	__global short *d2 = (__global short *)d;
+	__private short *s2 = (__private short *)s;
+        uint i;
+        uint n = size / align;
+
+	for (i=0; i<n; ++i)
+	    d2[i] = s2[i];
+    } else if (align == 4) {
+	__global int *d4 = (__global int *)d;
+	__private int *s4 = (__private int *)s;
+        uint i;
+        uint n = size / align;
+
+	for (i=0; i<n; ++i)
+	    d4[i] = s4[i];
+    } else if (align == 8) {
+	__global long *d8 = (__global long *)d;
+	__private long *s8 = (__private long *)s;
+        uint i;
+        uint n = size / align;
+
+	for (i=0; i<n; ++i)
+	    d8[i] = s8[i];
+    } else if (align == 16) {
+	__global long2 *d16 = (__global long2 *)d;
+	__private long2 *s16 = (__private long2 *)s;
+        uint i;
+        uint n = size / align;
+
+	for (i=0; i<n; ++i)
+	    d16[i] = s16[i];
+    } else if (align == 32 || align == 64 || align == 128) {
+	__global long4 *d32 = (__global long4 *)d;
+	__private long4 *s32 = (__private long4 *)s;
+        uint i;
+        uint n = size / 32U;
+
+	for (i=0; i<n; ++i)
+	    d32[i] = s32[i];
+    } else {
+	__global char *d1 = (__global char *)d;
+	__private char *s1 = (__private char *)s;
+	uint i;
+	uint n = size;
+
+	for (i=0; i<n; ++i)
+	    d1[i] = s1[i];
+    }
+}
+
+// eprep (wrapped by the __enqueue_prep_N entry points below) attempts to
+// allocate an AqlWrap and copy the captured context into the kernarg area.
+// It returns:
+//   1: an int status: CLK_SUCCESS, or CLK_ENQUEUE_FAILURE if no slot or
+//      argument space is available
+//   2: through wretp, a pointer to the wrap itself, to be passed to the
+//      actual enqueue call
+static int
+eprep(queue_t q, uint lsize, uint csize, uint calign, __private void *cptr, __global void* private* private wretp)
+{
+    __global AmdVQueueHeader *vq = (__global AmdVQueueHeader *)q;
+
+    lsize = align_up(lsize, calign);
+    if (lsize + csize > vq->arg_size)
+	return CLK_ENQUEUE_FAILURE;
+
+    int s = reserve_slot((__global uint *)vq->aql_slot_mask, vq->aql_slot_num);
+    if (s < 0)
+	return CLK_ENQUEUE_FAILURE;
+
+    __global AmdAqlWrap *a = (__global AmdAqlWrap *)(vq + 1);
+    __global void *kptr = (__global void *)((size_t)a[s].aql.kernel_arg_address + NUM_SPECIAL_ARGS*sizeof(size_t));
+    copy_captured_context(kptr, cptr, csize, calign);
+
+    *wretp = (__global void *)(a + s);
+    return CLK_SUCCESS;
+}
+
+#define GEN(N) \
+__attribute__((always_inline)) int \
+__enqueue_prep_##N(queue_t q, size_t csize, uint calign, __private void *cptr, __global void* private* private wretp) \
+{ \
+    return eprep(q, (uint)((N + NUM_SPECIAL_ARGS)*sizeof(size_t)), (uint)csize, calign, cptr, wretp); \
+}
+
+GEN(0)
+GEN(1)
+GEN(2)
+GEN(3)
+GEN(4)
+GEN(5)
+GEN(6)
+GEN(7)
+GEN(8)
+GEN(9)
+GEN(10)
+
+#endif
+
diff --git a/amd-builtins/devenq/events.cl b/amd-builtins/devenq/events.cl
new file mode 100644
index 0000000..b6d7f50
--- /dev/null
+++ b/amd-builtins/devenq/events.cl
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "devenq.h"
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+retain_event(clk_event_t e)
+{
+    __global AmdEvent *ev = (__global AmdEvent *)e;
+    atomic_fetch_add_explicit((__global atomic_uint *)&ev->counter, 1U, memory_order_acq_rel, memory_scope_device);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+release_event(clk_event_t e)
+{
+    __global AmdEvent *ev = (__global AmdEvent *)e;
+    uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U, memory_order_acq_rel, memory_scope_device);
+    if (c == 1U) {
+        __global AmdVQueueHeader *vq = get_vqueue();
+        __global uint *emask = (__global uint *)vq->event_slot_mask;
+        __global AmdEvent *eb = (__global AmdEvent *)vq->event_slots;
+        uint i = ev - eb;
+        release_slot(emask, i);
+    }
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) clk_event_t
+create_user_event(void)
+{
+    __global AmdVQueueHeader *vq = get_vqueue();
+    __global uint *emask = (__global uint *)vq->event_slot_mask;
+    int i = reserve_slot(emask, vq->event_slot_num);
+
+    if (i >= 0) {
+        __global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + i;
+        ev->state = CL_SUBMITTED;
+        ev->counter = 1;
+        ev->timer[0] = 0;
+        ev->timer[1] = 0;
+        ev->timer[2] = 0;
+        return (clk_event_t)ev;
+    } else
+        return (clk_event_t)(__global void *)NULL;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) bool
+is_valid_event(clk_event_t e)
+{
+    return (__global AmdEvent *)e != NULL;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+set_user_event_status(clk_event_t e, int s)
+{
+    __global AmdEvent *ev = (__global AmdEvent *)e;
+    atomic_store_explicit((__global atomic_uint *)&ev->state, (uint)s, memory_order_release, memory_scope_device);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+capture_event_profiling_info(clk_event_t e, clk_profiling_info n, __global void *p)
+{
+    // Currently the second argument must be CLK_PROFILING_COMMAND_EXEC_TIME
+    __global AmdEvent *ev = (__global AmdEvent *)e;
+    __global ulong *t = (__global ulong *)ev->timer;
+
+    ((__global ulong *)p)[0] = t[PROFILING_COMMAND_END] - t[PROFILING_COMMAND_START];
+    ((__global ulong *)p)[1] = t[PROFILING_COMMAND_COMPLETE] - t[PROFILING_COMMAND_START];
+}
+
+#endif
+
diff --git a/amd-builtins/devenq/getkern.cl b/amd-builtins/devenq/getkern.cl
new file mode 100644
index 0000000..4dd7893
--- /dev/null
+++ b/amd-builtins/devenq/getkern.cl
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "devenq.h"
+
+// Currently we have no information about the block with
+// which to make block-specific decisions.  Therefore these
+// library calls, corresponding to all of the possible
+// get_kernel_* built-in functions, take no arguments at all
+// and return a reasonable constant.
+
+__attribute__((always_inline)) uint
+__get_kernel_work_group_size_internal(void)
+{
+    return (uint)CL_DEVICE_MAX_WORK_GROUP_SIZE;
+}
+
+__attribute__((always_inline)) uint
+__get_kernel_preferred_work_group_size_multiple_internal(void)
+{
+    return 64U;
+}
+
+#endif
+
diff --git a/amd-builtins/devenq/ndrange.cl b/amd-builtins/devenq/ndrange.cl
new file mode 100644
index 0000000..68d0f19
--- /dev/null
+++ b/amd-builtins/devenq/ndrange.cl
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "devenq.h"
+
+// 1D variants
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_1D(size_t gws)
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 1;
+    rp->goff[0] = 0;
+    rp->goff[1] = 0;
+    rp->goff[2] = 0;
+    rp->gws[0] = gws;
+    rp->gws[1] = 1;
+    rp->gws[2] = 1;
+    rp->lws[0] = min(gws, (size_t)64);
+    rp->lws[1] = 1;
+    rp->lws[2] = 1;
+    return ret;
+}
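+
+// Example: ndrange_1D(1024) describes a 1D range with gws[0] = 1024 and a
+// default local size of min(1024, 64) = 64 work-items.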
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_1D(size_t gws, size_t lws)
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 1;
+    rp->goff[0] = 0;
+    rp->goff[1] = 0;
+    rp->goff[2] = 0;
+    rp->gws[0] = gws;
+    rp->gws[1] = 1;
+    rp->gws[2] = 1;
+    rp->lws[0] = lws;
+    rp->lws[1] = 1;
+    rp->lws[2] = 1;
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_1D(size_t goff, size_t gws, size_t lws)
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 1;
+    rp->goff[0] = goff;
+    rp->goff[1] = 0;
+    rp->goff[2] = 0;
+    rp->gws[0] = gws;
+    rp->gws[1] = 1;
+    rp->gws[2] = 1;
+    rp->lws[0] = lws;
+    rp->lws[1] = 1;
+    rp->lws[2] = 1;
+    return ret;
+}
+
+// 2D variants
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_2D(size_t gws[2])
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 2;
+    rp->goff[0] = 0;
+    rp->goff[1] = 0;
+    rp->goff[2] = 0;
+    rp->gws[0] = gws[0];
+    rp->gws[1] = gws[1];
+    rp->gws[2] = 1;
+    rp->lws[0] = min(gws[0], (size_t)8);
+    rp->lws[1] = min(gws[1], (size_t)8);
+    rp->lws[2] = 1;
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_2D(size_t gws[2], size_t lws[2])
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 2;
+    rp->goff[0] = 0;
+    rp->goff[1] = 0;
+    rp->goff[2] = 0;
+    rp->gws[0] = gws[0];
+    rp->gws[1] = gws[1];
+    rp->gws[2] = 1;
+    rp->lws[0] = lws[0];
+    rp->lws[1] = lws[1];
+    rp->lws[2] = 1;
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_2D(size_t goff[2], size_t gws[2], size_t lws[2])
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 2;
+    rp->goff[0] = goff[0];
+    rp->goff[1] = goff[1];
+    rp->goff[2] = 0;
+    rp->gws[0] = gws[0];
+    rp->gws[1] = gws[1];
+    rp->gws[2] = 1;
+    rp->lws[0] = lws[0];
+    rp->lws[1] = lws[1];
+    rp->lws[2] = 1;
+    return ret;
+}
+
+// 3D variants
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_3D(size_t gws[3])
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 3;
+    rp->goff[0] = 0;
+    rp->goff[1] = 0;
+    rp->goff[2] = 0;
+    rp->gws[0] = gws[0];
+    rp->gws[1] = gws[1];
+    rp->gws[2] = gws[2];
+    rp->lws[0] = min(gws[0], (size_t)4);
+    rp->lws[1] = min(gws[1], (size_t)4);
+    rp->lws[2] = min(gws[2], (size_t)4);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_3D(size_t gws[3], size_t lws[3])
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 3;
+    rp->goff[0] = 0;
+    rp->goff[1] = 0;
+    rp->goff[2] = 0;
+    rp->gws[0] = gws[0];
+    rp->gws[1] = gws[1];
+    rp->gws[2] = gws[2];
+    rp->lws[0] = lws[0];
+    rp->lws[1] = lws[1];
+    rp->lws[2] = lws[2];
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) ndrange_t
+ndrange_3D(size_t goff[3], size_t gws[3], size_t lws[3])
+{
+    ndrange_t ret;
+    __private NdRange *rp = (__private NdRange *)&ret;
+    rp->dim = 3;
+    rp->goff[0] = goff[0];
+    rp->goff[1] = goff[1];
+    rp->goff[2] = goff[2];
+    rp->gws[0] = gws[0];
+    rp->gws[1] = gws[1];
+    rp->gws[2] = gws[2];
+    rp->lws[0] = lws[0];
+    rp->lws[1] = lws[1];
+    rp->lws[2] = lws[2];
+    return ret;
+}
+
+#endif
+
diff --git a/amd-builtins/geom/clamp.cl b/amd-builtins/geom/clamp.cl
new file mode 100644
index 0000000..6b90960
--- /dev/null
+++ b/amd-builtins/geom/clamp.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable,weak,always_inline)) float
+clamp(float x, float minval, float maxval)
+{
+    return fmin(fmax(x, minval), maxval);
+}
+
+__attribute__((overloadable,weak,always_inline)) double
+clamp(double x, double minval, double maxval)
+{
+    // We think there is a bug in section 9.3.3 and match the float version instead
+    return fmin(fmax(x, minval), maxval);
+}
+
+// Integer clamp functions
+
+#define ICLAMP(TY) \
+__attribute__((overloadable,weak,always_inline)) TY \
+clamp(TY x, TY minval, TY maxval) \
+{ \
+    return min(max(x, minval), maxval); \
+}
+
+ICLAMP(char)
+ICLAMP(uchar)
+
+ICLAMP(short)
+ICLAMP(ushort)
+
+ICLAMP(int)
+ICLAMP(uint)
+
+ICLAMP(long)
+ICLAMP(ulong) 
+
diff --git a/amd-builtins/geom/cross.cl b/amd-builtins/geom/cross.cl
new file mode 100644
index 0000000..4af3851
--- /dev/null
+++ b/amd-builtins/geom/cross.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float3
+cross(float3 p0, float3 p1)
+{
+    return (float3)(p0.y * p1.z - p0.z * p1.y,
+                    p0.z * p1.x - p0.x * p1.z,
+                    p0.x * p1.y - p0.y * p1.x);
+}
+
+__attribute__((overloadable, weak,always_inline)) double3
+cross(double3 p0, double3 p1)
+{
+    return (double3)(p0.y * p1.z - p0.z * p1.y,
+                     p0.z * p1.x - p0.x * p1.z,
+                     p0.x * p1.y - p0.y * p1.x);
+}
+
+__attribute__((overloadable, weak,always_inline)) float4
+cross(float4 p0, float4 p1)
+{
+    return (float4)(p0.y * p1.z - p0.z * p1.y,
+                    p0.z * p1.x - p0.x * p1.z,
+                    p0.x * p1.y - p0.y * p1.x,
+                    0.0f); // w component of the result is defined to be 0.0
+}
+
+__attribute__((overloadable, weak,always_inline)) double4
+cross(double4 p0, double4 p1)
+{
+    return (double4)(p0.y * p1.z - p0.z * p1.y,
+                     p0.z * p1.x - p0.x * p1.z,
+                     p0.x * p1.y - p0.y * p1.x,
+                     0.0); // w component of the result is defined to be 0.0
+}
+
diff --git a/amd-builtins/geom/degrees.cl b/amd-builtins/geom/degrees.cl
new file mode 100644
index 0000000..06ca875
--- /dev/null
+++ b/amd-builtins/geom/degrees.cl
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+degrees(float radians)
+{
+    // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F
+    return 0x1.ca5dc2p+5F * radians;
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+degrees(double radians)
+{
+    // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F
+    return 0x1.ca5dc1a63c1f8p+5 * radians;
+}
+
+//! Converts degrees to radians, i.e. (PI / 180) * degrees.
+//
+__attribute__((overloadable, weak,always_inline)) float
+radians(float degrees)
+{
+    // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F
+    return 0x1.1df46ap-6F * degrees;
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+radians(double degrees)
+{
+    // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F
+    return 0x1.1df46a2529d39p-6 * degrees;
+}
+
diff --git a/amd-builtins/geom/distance.cl b/amd-builtins/geom/distance.cl
new file mode 100644
index 0000000..78147ac
--- /dev/null
+++ b/amd-builtins/geom/distance.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+distance(float p0, float p1)
+{
+    return length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+distance(double p0, double p1)
+{
+    return length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+distance(float2 p0, float2 p1)
+{
+    return length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+distance(double2 p0, double2 p1)
+{
+    return length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+distance(float3 p0, float3 p1)
+{
+    return length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+distance(double3 p0, double3 p1)
+{
+    return length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+distance(float4 p0, float4 p1)
+{
+    return length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+distance(double4 p0, double4 p1)
+{
+    return length(p0 - p1);
+}
+
diff --git a/amd-builtins/geom/dot.cl b/amd-builtins/geom/dot.cl
new file mode 100644
index 0000000..3271cbb
--- /dev/null
+++ b/amd-builtins/geom/dot.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak, always_inline)) float
+dot(float p0, float p1)
+{
+    return p0 * p1;
+}
+
+__attribute__((overloadable, weak, always_inline)) float
+dot(float2 p0, float2 p1)
+{
+    float2 p = p0 * p1;
+    return p.x + p.y;
+}
+
+__attribute__((overloadable, weak, always_inline)) float
+dot(float3 p0, float3 p1)
+{
+    float3 p = p0 * p1;
+    return p.x + p.y + p.z;
+}
+
+__attribute__((overloadable, weak, always_inline)) float
+dot(float4 p0, float4 p1)
+{
+    float4 p = p0 * p1;
+    return p.x + p.y + p.z + p.w;
+}
+
+__attribute__((overloadable, weak, always_inline)) double
+dot(double p0, double p1)
+{
+    return p0 * p1;
+}
+
+__attribute__((overloadable, weak, always_inline)) double
+dot(double2 p0, double2 p1)
+{
+    double2 p = p0 * p1;
+    return p.x + p.y;
+}
+
+__attribute__((overloadable, weak, always_inline)) double
+dot(double3 p0, double3 p1)
+{
+    double3 p = p0 * p1;
+    return p.x + p.y + p.z;
+}
+
+__attribute__((overloadable, weak, always_inline)) double
+dot(double4 p0, double4 p1)
+{
+    double4 p = p0 * p1;
+    return p.x + p.y + p.z + p.w;
+}
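+
+// Note: the vector overloads multiply component-wise and then sum left to
+// right with plain adds (no fma), so e.g. dot((float2)(1.0F, 2.0F),
+// (float2)(3.0F, 4.0F)) evaluates as 1*3 + 2*4 == 11.0F.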
diff --git a/amd-builtins/geom/fast_distance.cl b/amd-builtins/geom/fast_distance.cl
new file mode 100644
index 0000000..434451b
--- /dev/null
+++ b/amd-builtins/geom/fast_distance.cl
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_distance(float p0, float p1)
+{
+    return fast_length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_distance(float2 p0, float2 p1)
+{
+    return fast_length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_distance(float3 p0, float3 p1)
+{
+    return fast_length(p0 - p1);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_distance(float4 p0, float4 p1)
+{
+    return fast_length(p0 - p1);
+}
+
diff --git a/amd-builtins/geom/fast_length.cl b/amd-builtins/geom/fast_length.cl
new file mode 100644
index 0000000..5d2a27e
--- /dev/null
+++ b/amd-builtins/geom/fast_length.cl
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_length(float p)
+{
+    return fabs(p);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_length(float2 p)
+{
+    return half_sqrt(dot(p, p));
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_length(float3 p)
+{
+    return half_sqrt(dot(p, p));
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_length(float4 p)
+{
+    return half_sqrt(dot(p, p));
+}
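+
+// Unlike length(), fast_length() uses half_sqrt() and performs no rescaling,
+// trading accuracy for speed; dot(p, p) may overflow or underflow for inputs
+// with very large or very small magnitudes.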
+
diff --git a/amd-builtins/geom/fast_normalize.cl b/amd-builtins/geom/fast_normalize.cl
new file mode 100644
index 0000000..feb45b9
--- /dev/null
+++ b/amd-builtins/geom/fast_normalize.cl
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+fast_normalize(float p)
+{
+    return normalize(p);
+}
+
+__attribute__((overloadable, weak,always_inline)) float2
+fast_normalize(float2 p)
+{
+    float l2 = dot(p, p);
+    return l2 == 0.0F ? p : p * half_rsqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) float3
+fast_normalize(float3 p)
+{
+    float l2 = dot(p, p);
+    return l2 == 0.0F ? p : p * half_rsqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) float4
+fast_normalize(float4 p)
+{
+    float l2 = dot(p, p);
+    return l2 == 0.0F ? p : p * half_rsqrt(l2);
+}
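+
+// The l2 == 0 guard returns the zero vector unchanged rather than producing
+// NaNs from 0 * half_rsqrt(0); as with fast_length(), there is no rescaling
+// for squared lengths that overflow or underflow.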
+
diff --git a/amd-builtins/geom/length.cl b/amd-builtins/geom/length.cl
new file mode 100644
index 0000000..1570fc3
--- /dev/null
+++ b/amd-builtins/geom/length.cl
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+length(float p)
+{
+    return fabs(p);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+length(double p)
+{
+    return fabs(p);
+}
+
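+// For the vector overloads below, dot(p, p) can lose precision (or underflow)
+// for tiny inputs and can overflow to infinity for large ones. Both cases are
+// handled by rescaling p by a power of two, taking the square root, and
+// scaling the result back; the exponents (e.g. 2^86 and 2^65 for float2) are
+// chosen so the rescaled squared length lands safely in the normal range.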
+__attribute__((overloadable, weak,always_inline)) float
+length(float2 p)
+{
+    float l2 = dot(p, p);
+
+    if (l2 < FLT_MIN) {
+        p *= 0x1.0p+86F;
+        return sqrt(dot(p, p)) * 0x1.0p-86F;
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-65F;
+        return sqrt(dot(p, p)) * 0x1.0p+65F;
+    }
+
+    return sqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+length(double2 p)
+{
+    double l2 = dot(p, p);
+
+    if (l2 < DBL_MIN) {
+        p *= 0x1.0p+563;
+        return sqrt(dot(p, p)) * 0x1.0p-563;
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-513;
+        return sqrt(dot(p, p)) * 0x1.0p+513;
+    }
+
+    return sqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+length(float3 p)
+{
+    float l2 = dot(p, p);
+
+    if (l2 < FLT_MIN) {
+        p *= 0x1.0p+86F;
+        return sqrt(dot(p, p)) * 0x1.0p-86F;
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-66F;
+        return sqrt(dot(p, p)) * 0x1.0p+66F;
+    }
+
+    return sqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+length(double3 p)
+{
+    double l2 = dot(p, p);
+
+    if (l2 < DBL_MIN) {
+        p *= 0x1.0p+563;
+        return sqrt(dot(p, p)) * 0x1.0p-563;
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-514;
+        return sqrt(dot(p, p)) * 0x1.0p+514;
+    }
+
+    return sqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+length(float4 p)
+{
+    float l2 = dot(p, p);
+
+    if (l2 < FLT_MIN) {
+        p *= 0x1.0p+86F;
+        return sqrt(dot(p, p)) * 0x1.0p-86F;
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-66F;
+        return sqrt(dot(p, p)) * 0x1.0p+66F;
+    }
+
+    return sqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+length(double4 p)
+{
+    double l2 = dot(p, p);
+
+    if (l2 < DBL_MIN) {
+        p *= 0x1.0p+563;
+        return sqrt(dot(p, p)) * 0x1.0p-563;
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-514;
+        return sqrt(dot(p, p)) * 0x1.0p+514;
+    }
+
+    return sqrt(l2);
+}
+
diff --git a/amd-builtins/geom/mix.cl b/amd-builtins/geom/mix.cl
new file mode 100644
index 0000000..0ac7f9b
--- /dev/null
+++ b/amd-builtins/geom/mix.cl
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+//extern __attribute__((pure)) float  __amdil_lerp_f32(float,  float,  float);
+
+// TODO_HSA: Validate that fma works for mix
+__attribute__((overloadable, weak,always_inline)) float
+mix(float x, float y, float a)
+{
+    //return __amdil_lerp_f32(a, y, x);
+    return fma(a, (y - x), x);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+mix(double x, double y, double a)
+{
+    //return x + (y - x) * a;
+    return fma(a, (y - x), x);
+}
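+
+// mix(x, y, a) computes x + (y - x) * a; using fma() rounds the multiply-add
+// once, so for finite x and y, mix(x, y, 0) == x exactly. Per the OpenCL
+// spec, results are undefined if a is outside [0.0, 1.0].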
+
diff --git a/amd-builtins/geom/normalize.cl b/amd-builtins/geom/normalize.cl
new file mode 100644
index 0000000..774892b
--- /dev/null
+++ b/amd-builtins/geom/normalize.cl
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+normalize(float p)
+{
+    return sign(p);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+normalize(double p)
+{
+    return sign(p);
+}
+
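+// The vector overloads rescale as in length(); additionally, if the squared
+// length is still infinite after scaling down (i.e. some component is itself
+// infinite), the infinite components are replaced by +/-1 and the finite ones
+// by signed zeros via copysign/select, and that vector is then normalized.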
+__attribute__((overloadable, weak,always_inline)) float2
+normalize(float2 p)
+{
+    if (all(p == (float2)0.0F))
+        return p;
+
+    float l2 = dot(p, p);
+
+    if (l2 < FLT_MIN) {
+        p *= 0x1.0p+86F;
+        l2 = dot(p, p);
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-65F;
+        l2 = dot(p, p);
+        if (l2 == INFINITY) {
+            p = copysign(select((float2)0.0F, (float2)1.0F, isinf(p)), p);
+            l2 = dot(p, p);
+        }
+    }
+    return p * rsqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) double2
+normalize(double2 p)
+{
+    if (all(p == (double2)0.0))
+        return p;
+
+    double l2 = dot(p, p);
+
+    if (l2 < DBL_MIN) {
+        p *= 0x1.0p+563;
+        l2 = dot(p, p);
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-513;
+        l2 = dot(p, p);
+        if (l2 == INFINITY) {
+            p = copysign(select((double2)0.0, (double2)1.0, isinf(p)), p);
+            l2 = dot(p, p);
+        }
+    }
+    return p * rsqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) float3
+normalize(float3 p)
+{
+    if (all(p == (float3)0.0F))
+        return p;
+
+    float l2 = dot(p, p);
+
+    if (l2 < FLT_MIN) {
+        p *= 0x1.0p+86F;
+        l2 = dot(p, p);
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-66F;
+        l2 = dot(p, p);
+        if (l2 == INFINITY) {
+            p = copysign(select((float3)0.0F, (float3)1.0F, isinf(p)), p);
+            l2 = dot(p, p);
+        }
+    }
+    return p * rsqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) double3
+normalize(double3 p)
+{
+    if (all(p == (double3)0.0))
+        return p;
+
+    double l2 = dot(p, p);
+
+    if (l2 < DBL_MIN) {
+        p *= 0x1.0p+563;
+        l2 = dot(p, p);
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-514;
+        l2 = dot(p, p);
+        if (l2 == INFINITY) {
+            p = copysign(select((double3)0.0, (double3)1.0, isinf(p)), p);
+            l2 = dot(p, p);
+        }
+    }
+    return p * rsqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) float4
+normalize(float4 p)
+{
+    if (all(p == (float4)0.0F))
+        return p;
+
+    float l2 = dot(p, p);
+
+    if (l2 < FLT_MIN) {
+        p *= 0x1.0p+86F;
+        l2 = dot(p, p);
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-66F;
+        l2 = dot(p, p);
+        if (l2 == INFINITY) {
+            p = copysign(select((float4)0.0F, (float4)1.0F, isinf(p)), p);
+            l2 = dot(p, p);
+        }
+    }
+    return p * rsqrt(l2);
+}
+
+__attribute__((overloadable, weak,always_inline)) double4
+normalize(double4 p)
+{
+    if (all(p == (double4)0.0))
+        return p;
+
+    double l2 = dot(p, p);
+
+    if (l2 < DBL_MIN) {
+        p *= 0x1.0p+563;
+        l2 = dot(p, p);
+    } else if (l2 == INFINITY) {
+        p *= 0x1.0p-514;
+        l2 = dot(p, p);
+        if (l2 == INFINITY) {
+            p = copysign(select((double4)0.0, (double4)1.0, isinf(p)), p);
+            l2 = dot(p, p);
+        }
+    }
+    return p * rsqrt(l2);
+}
+
diff --git a/amd-builtins/geom/sign.cl b/amd-builtins/geom/sign.cl
new file mode 100644
index 0000000..db5c692
--- /dev/null
+++ b/amd-builtins/geom/sign.cl
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define G(T) \
+__attribute__((overloadable, weak, always_inline)) T \
+sign(T x) \
+{ \
+    return copysign((x == (T)0) | isnan(x) ? (T)0 : (T)1, x); \
+}
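+
+// Note: copysign() transfers the sign bit even when the magnitude is zero,
+// so sign(-0.0F) returns -0.0F, and sign(NaN) returns a zero carrying NaN's
+// sign bit, which compares equal to the 0.0 the OpenCL spec requires.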
+
+G(float)
+// TODO_HSA: resolve vector expansions
+//G(float2)
+//G(float3)
+//G(float4)
+
+G(double)
+
diff --git a/amd-builtins/geom/step.cl b/amd-builtins/geom/step.cl
new file mode 100644
index 0000000..dcff4df
--- /dev/null
+++ b/amd-builtins/geom/step.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((overloadable, weak,always_inline)) float
+step(float edge, float x)
+{
+    return x < edge ? 0.0F : 1.0F;
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+step(double edge, double x)
+{
+    return x < edge ? 0.0 : 1.0;
+}
+
+__attribute__((overloadable, weak,always_inline)) float
+smoothstep(float edge0, float edge1, float x)
+{
+    float t = clamp((x - edge0) / (edge1 - edge0), 0.0F, 1.0F);
+    return t * t * (3.0F - 2.0F * t);
+}
+
+__attribute__((overloadable, weak,always_inline)) double
+smoothstep(double edge0, double edge1, double x)
+{
+    double t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0);
+    return t * t * (3.0 - 2.0 * t);
+}
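+
+// Per the OpenCL specification, smoothstep results are undefined if
+// edge0 >= edge1 or if any argument is NaN, so no guard is needed here.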
+
diff --git a/amd-builtins/image/get.cl b/amd-builtins/image/get.cl
new file mode 100644
index 0000000..ba62507
--- /dev/null
+++ b/amd-builtins/image/get.cl
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// Image query built-ins
+
+#if __OPENCL_C_VERSION__ >= 200
+#define CLK_UNORM_INT24              0x10DF
+// This BRIG enum must match the one in \compiler\hsail-tools\libHSAIL\Brig_new.hpp
+// TODO: These enums should live in a single shared header used by all components
+enum BrigImageChannelOrder {
+    //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc }
+    //.mnemo_token=EImageOrder
+    //.mnemo_context=EImageOrderContext
+    BRIG_CHANNEL_ORDER_A             = 0,
+    BRIG_CHANNEL_ORDER_R             = 1,
+    BRIG_CHANNEL_ORDER_RX            = 2,
+    BRIG_CHANNEL_ORDER_RG            = 3,
+    BRIG_CHANNEL_ORDER_RGX           = 4,
+    BRIG_CHANNEL_ORDER_RA            = 5,
+    BRIG_CHANNEL_ORDER_RGB           = 6,
+    BRIG_CHANNEL_ORDER_RGBX          = 7,
+    BRIG_CHANNEL_ORDER_RGBA          = 8,
+    BRIG_CHANNEL_ORDER_BGRA          = 9,
+    BRIG_CHANNEL_ORDER_ARGB          = 10,
+    BRIG_CHANNEL_ORDER_ABGR          = 11,
+    BRIG_CHANNEL_ORDER_SRGB          = 12,
+    BRIG_CHANNEL_ORDER_SRGBX         = 13,
+    BRIG_CHANNEL_ORDER_SRGBA         = 14,
+    BRIG_CHANNEL_ORDER_SBGRA         = 15,
+    BRIG_CHANNEL_ORDER_INTENSITY     = 16,
+    BRIG_CHANNEL_ORDER_LUMINANCE     = 17,
+    BRIG_CHANNEL_ORDER_DEPTH         = 18,
+    BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19
+};
+
+enum BrigImageChannelType {
+    //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc }
+    //.mnemo_token=EImageFormat
+    BRIG_CHANNEL_TYPE_SNORM_INT8         = 0,
+    BRIG_CHANNEL_TYPE_SNORM_INT16        = 1,
+    BRIG_CHANNEL_TYPE_UNORM_INT8         = 2,
+    BRIG_CHANNEL_TYPE_UNORM_INT16        = 3,
+    BRIG_CHANNEL_TYPE_UNORM_INT24        = 4,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_555    = 5,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_565    = 6,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
+    BRIG_CHANNEL_TYPE_SIGNED_INT8        = 8,
+    BRIG_CHANNEL_TYPE_SIGNED_INT16       = 9,
+    BRIG_CHANNEL_TYPE_SIGNED_INT32       = 10,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT8      = 11,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT16     = 12,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT32     = 13,
+    BRIG_CHANNEL_TYPE_HALF_FLOAT         = 14,
+    BRIG_CHANNEL_TYPE_FLOAT              = 15
+};
+#endif
+
+// Hsail image query intrinsics
+extern __attribute__((pure)) int __hsail_query_width_1d(image1d_t);
+extern __attribute__((pure)) int __hsail_query_width_1db(image1d_buffer_t);
+extern __attribute__((pure)) int __hsail_query_width_1da(image1d_array_t);
+extern __attribute__((pure)) int __hsail_query_width_2d(image2d_t);
+extern __attribute__((pure)) int __hsail_query_width_2da(image2d_array_t);
+extern __attribute__((pure)) int __hsail_query_width_3d(image3d_t);
+
+extern __attribute__((pure)) int __hsail_query_height_2d(image2d_t);
+extern __attribute__((pure)) int __hsail_query_height_2da(image2d_array_t);
+extern __attribute__((pure)) int __hsail_query_height_3d(image3d_t);
+
+extern __attribute__((pure)) int __hsail_depth_3d(image3d_t);
+
+extern __attribute__((pure)) int __hsail_query_format_1d(image1d_t);
+extern __attribute__((pure)) int __hsail_query_format_1db(image1d_buffer_t);
+extern __attribute__((pure)) int __hsail_query_format_1da(image1d_array_t);
+extern __attribute__((pure)) int __hsail_query_format_2d(image2d_t);
+extern __attribute__((pure)) int __hsail_query_format_2da(image2d_array_t);
+extern __attribute__((pure)) int __hsail_query_format_3d(image3d_t);
+
+extern __attribute__((pure)) int __hsail_query_order_1d(image1d_t);
+extern __attribute__((pure)) int __hsail_query_order_1db(image1d_buffer_t);
+extern __attribute__((pure)) int __hsail_query_order_1da(image1d_array_t);
+extern __attribute__((pure)) int __hsail_query_order_2d(image2d_t);
+extern __attribute__((pure)) int __hsail_query_order_2da(image2d_array_t);
+extern __attribute__((pure)) int __hsail_query_order_3d(image3d_t);
+
+extern __attribute__((pure)) uint __hsail_query_array_1da(image1d_array_t);
+extern __attribute__((pure)) uint __hsail_query_array_2da(image2d_array_t);
+
+
+#define DefQueryImage(Func,HsailIntrin,ImageTy,RetTy) \
+__attribute__((overloadable, always_inline)) RetTy  \
+Func(ImageTy image) { \
+  return (RetTy)HsailIntrin(image);  \
+}
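+
+// For reference (attributes omitted), DefQueryImage(get_image_width,
+// __hsail_query_width_2d, image2d_t, int) expands to:
+//   int get_image_width(image2d_t image) {
+//     return (int)__hsail_query_width_2d(image);
+//   }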
+
+#if __OPENCL_C_VERSION__ >= 200
+#define DefQueryImageChOrder(Func,HsailIntrin,ImageTy,RetTy) \
+__attribute__((overloadable, always_inline)) RetTy  \
+Func(ImageTy image) { \
+  uint Chorder = (uint)HsailIntrin(image);  \
+  return mapBRIGChOrderToOCLChOrder(Chorder); \
+}
+
+#define DefQueryImageChType(Func,HsailIntrin,ImageTy,RetTy) \
+__attribute__((overloadable, always_inline)) RetTy  \
+Func(ImageTy image) { \
+  uint Chtype = (uint)HsailIntrin(image);  \
+  return mapBRIGChTypeToOCLChType(Chtype); \
+}
+
+static inline uint mapBRIGChOrderToOCLChOrder(uint BRIGChOrder) {
+  uint chorder = 0; // avoid an uninitialized result for unrecognized values
+  switch (BRIGChOrder) {
+    case BRIG_CHANNEL_ORDER_A: chorder = CLK_A; break;
+    case BRIG_CHANNEL_ORDER_R: chorder = CLK_R; break;
+    case BRIG_CHANNEL_ORDER_RX: chorder = CLK_Rx; break;
+    case BRIG_CHANNEL_ORDER_RG: chorder = CLK_RG; break;
+    case BRIG_CHANNEL_ORDER_RGX: chorder = CLK_RGx; break;
+    case BRIG_CHANNEL_ORDER_RA: chorder = CLK_RA; break;
+    case BRIG_CHANNEL_ORDER_RGB: chorder = CLK_RGB; break;
+    case BRIG_CHANNEL_ORDER_RGBX: chorder = CLK_RGBx; break;
+    case BRIG_CHANNEL_ORDER_RGBA: chorder = CLK_RGBA; break;
+    case BRIG_CHANNEL_ORDER_BGRA: chorder = CLK_BGRA; break;
+    case BRIG_CHANNEL_ORDER_ARGB: chorder = CLK_ARGB; break;
+    case BRIG_CHANNEL_ORDER_SRGB: chorder = CLK_sRGB; break;
+    case BRIG_CHANNEL_ORDER_SRGBX: chorder = CLK_sRGBx; break;
+    case BRIG_CHANNEL_ORDER_SRGBA: chorder = CLK_sRGBA; break;
+    case BRIG_CHANNEL_ORDER_SBGRA: chorder = CLK_sBGRA; break;
+    case BRIG_CHANNEL_ORDER_INTENSITY: chorder = CLK_INTENSITY; break;
+    case BRIG_CHANNEL_ORDER_LUMINANCE: chorder = CLK_LUMINANCE; break;
+    case BRIG_CHANNEL_ORDER_DEPTH: chorder = CLK_DEPTH; break;
+    case BRIG_CHANNEL_ORDER_DEPTH_STENCIL: chorder = CLK_DEPTH_STENCIL; break;
+  }
+  return chorder;
+}
+
+static inline uint mapBRIGChTypeToOCLChType(uint BRIGChType) {
+  uint chtype = 0; // avoid an uninitialized result for unrecognized values
+  switch (BRIGChType) {
+    case BRIG_CHANNEL_TYPE_SNORM_INT8: chtype = CLK_SNORM_INT8; break;
+    case BRIG_CHANNEL_TYPE_SNORM_INT16: chtype = CLK_SNORM_INT16; break;
+    case BRIG_CHANNEL_TYPE_UNORM_INT8: chtype = CLK_UNORM_INT8; break;
+    case BRIG_CHANNEL_TYPE_UNORM_INT16: chtype = CLK_UNORM_INT16; break;
+    case BRIG_CHANNEL_TYPE_UNORM_INT24: chtype = CLK_UNORM_INT24; break;
+    case BRIG_CHANNEL_TYPE_UNORM_SHORT_555: chtype = CLK_UNORM_SHORT_555; break;
+    case BRIG_CHANNEL_TYPE_UNORM_SHORT_565: chtype = CLK_UNORM_SHORT_565; break;
+    // TODO: Change *_UNORM_SHORT_101010 to *_UNORM_INT_101010 once the BRIG enum changes
+    case BRIG_CHANNEL_TYPE_UNORM_SHORT_101010: chtype = CLK_UNORM_INT_101010; break;
+    case BRIG_CHANNEL_TYPE_SIGNED_INT8: chtype = CLK_SIGNED_INT8; break;
+    case BRIG_CHANNEL_TYPE_SIGNED_INT16: chtype = CLK_SIGNED_INT16; break;
+    case BRIG_CHANNEL_TYPE_SIGNED_INT32: chtype = CLK_SIGNED_INT32; break;
+    case BRIG_CHANNEL_TYPE_UNSIGNED_INT8: chtype = CLK_UNSIGNED_INT8; break;
+    case BRIG_CHANNEL_TYPE_UNSIGNED_INT16: chtype = CLK_UNSIGNED_INT16; break;
+    case BRIG_CHANNEL_TYPE_UNSIGNED_INT32: chtype = CLK_UNSIGNED_INT32; break;
+    case BRIG_CHANNEL_TYPE_HALF_FLOAT: chtype = CLK_HALF_FLOAT; break;
+    case BRIG_CHANNEL_TYPE_FLOAT: chtype = CLK_FLOAT; break;
+  }
+  return chtype;
+}
+#endif
+
+DefQueryImage(get_image_width, __hsail_query_width_1d, image1d_t, int)
+DefQueryImage(get_image_width, __hsail_query_width_1db, image1d_buffer_t, int)
+DefQueryImage(get_image_width, __hsail_query_width_1da, image1d_array_t, int)
+DefQueryImage(get_image_width, __hsail_query_width_2d, image2d_t, int)
+DefQueryImage(get_image_width, __hsail_query_width_2da, image2d_array_t, int)
+DefQueryImage(get_image_width, __hsail_query_width_3d, image3d_t, int)
+
+DefQueryImage(get_image_height, __hsail_query_height_2d, image2d_t, int)
+DefQueryImage(get_image_height, __hsail_query_height_2da, image2d_array_t, int)
+DefQueryImage(get_image_height, __hsail_query_height_3d, image3d_t, int)
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+get_image_depth(image3d_t image) {
+  return __hsail_depth_3d(image);
+}
+
+#if __OPENCL_C_VERSION__ >= 200
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_1d, image1d_t, int)
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_1db, image1d_buffer_t, int)
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_1da, image1d_array_t, int)
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_2d, image2d_t, int)
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_2da, image2d_array_t, int)
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_format_3d, image3d_t, int)
+
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_1d, image1d_t, int)
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_1db, image1d_buffer_t, int)
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_1da, image1d_array_t, int)
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_2d, image2d_t, int)
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_2da, image2d_array_t, int)
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_order_3d, image3d_t, int)
+
+#else
+DefQueryImage(get_image_channel_data_type, __hsail_query_format_1d, image1d_t, int)
+DefQueryImage(get_image_channel_data_type, __hsail_query_format_1db, image1d_buffer_t, int)
+DefQueryImage(get_image_channel_data_type, __hsail_query_format_1da, image1d_array_t, int)
+DefQueryImage(get_image_channel_data_type, __hsail_query_format_2d, image2d_t, int)
+DefQueryImage(get_image_channel_data_type, __hsail_query_format_2da, image2d_array_t, int)
+DefQueryImage(get_image_channel_data_type, __hsail_query_format_3d, image3d_t, int)
+
+DefQueryImage(get_image_channel_order, __hsail_query_order_1d, image1d_t, int)
+DefQueryImage(get_image_channel_order, __hsail_query_order_1db, image1d_buffer_t, int)
+DefQueryImage(get_image_channel_order, __hsail_query_order_1da, image1d_array_t, int)
+DefQueryImage(get_image_channel_order, __hsail_query_order_2d, image2d_t, int)
+DefQueryImage(get_image_channel_order, __hsail_query_order_2da, image2d_array_t, int)
+DefQueryImage(get_image_channel_order, __hsail_query_order_3d, image3d_t, int)
+#endif
+
+__attribute__((overloadable, always_inline)) int2
+get_image_dim(image2d_t image) {
+  int2 dim;
+  dim.x = get_image_width(image);
+  dim.y = get_image_height(image);
+  return dim;
+}
+
+__attribute__((overloadable, always_inline)) int2
+get_image_dim(image2d_array_t image) {
+  int2 dim;
+  dim.x = get_image_width(image);
+  dim.y = get_image_height(image);
+  return dim;
+}
+
+__attribute__((overloadable, always_inline)) int4
+get_image_dim(image3d_t image) {
+  int4 dim;
+  dim.x = get_image_width(image);
+  dim.y = get_image_height(image);
+  dim.z = get_image_depth(image);
+  dim.w = 0;
+  return dim;
+}
+
+DefQueryImage(get_image_array_size, __hsail_query_array_1da, image1d_array_t, size_t)
+DefQueryImage(get_image_array_size, __hsail_query_array_2da, image2d_array_t, size_t)
+
+#if __OPENCL_C_VERSION__ >= 200
+// Image-2.0 query built-ins
+
+// Hsail image query intrinsics
+extern __attribute__((pure)) int __hsail_query_width_2ddepth(image2d_depth_t);
+extern __attribute__((pure)) int __hsail_query_width_2dadepth(image2d_array_depth_t);
+
+extern __attribute__((pure)) int __hsail_query_height_2ddepth(image2d_depth_t);
+extern __attribute__((pure)) int __hsail_query_height_2dadepth(image2d_array_depth_t);
+
+extern __attribute__((pure)) int __hsail_query_array_2dadepth(image2d_array_depth_t);
+
+extern __attribute__((pure)) int __hsail_query_channelorder_2ddepth(image2d_depth_t);
+extern __attribute__((pure)) int __hsail_query_channelorder_2dadepth(image2d_array_depth_t);
+
+extern __attribute__((pure)) int __hsail_query_channeltype_2ddepth(image2d_depth_t);
+extern __attribute__((pure)) int __hsail_query_channeltype_2dadepth(image2d_array_depth_t);
+
+DefQueryImage(get_image_width, __hsail_query_width_2ddepth, image2d_depth_t, int)
+DefQueryImage(get_image_width, __hsail_query_width_2dadepth, image2d_array_depth_t, int)
+
+DefQueryImage(get_image_height, __hsail_query_height_2ddepth, image2d_depth_t, int)
+DefQueryImage(get_image_height, __hsail_query_height_2dadepth, image2d_array_depth_t, int)
+
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_channeltype_2ddepth, image2d_depth_t, int)
+DefQueryImageChType(get_image_channel_data_type, __hsail_query_channeltype_2dadepth, image2d_array_depth_t, int)
+
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_channelorder_2ddepth, image2d_depth_t, int)
+DefQueryImageChOrder(get_image_channel_order, __hsail_query_channelorder_2dadepth, image2d_array_depth_t, int)
+
+
+__attribute__((overloadable, always_inline)) int2
+get_image_dim(image2d_depth_t image) {
+  int2 dim;
+  dim.x = get_image_width(image);
+  dim.y = get_image_height(image);
+  return dim;
+}
+__attribute__((overloadable, always_inline)) int2
+get_image_dim(image2d_array_depth_t image) {
+  int2 dim;
+  dim.x = get_image_width(image);
+  dim.y = get_image_height(image);
+  return dim;
+}
+
+DefQueryImage(get_image_array_size, __hsail_query_array_2dadepth, image2d_array_depth_t, size_t)
+
+#endif // __OPENCL_C_VERSION__ >= 200
diff --git a/amd-builtins/image/read.cl b/amd-builtins/image/read.cl
new file mode 100644
index 0000000..0f7ae0b
--- /dev/null
+++ b/amd-builtins/image/read.cl
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Image read built-ins
+
+struct pixel_data_f32 {
+  float x;
+  float y;
+  float z;
+  float w;
+};
+
+struct pixel_data_s32 {
+  int x;
+  int y;
+  int z;
+  int w;
+};
+
+struct pixel_data_u32 {
+  uint x;
+  uint y;
+  uint z;
+  uint w;
+};
+
+// Read Image 1d
+
+#define DefReadImage1D(Func, HsailIntrin, DstTy, CoordTy, DstSuf)       \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image1d_t, sampler_t, CoordTy);                           \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image1d_t image, sampler_t sampler, CoordTy coord) {             \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, sampler, coord);                        \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
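+
+// Each instantiation declares the HSAIL intrinsic, which returns a pixel_data
+// struct, plus an overloaded wrapper that repacks the four struct fields into
+// a vector; e.g. read_imagef(image1d_t, sampler_t, int) returns a float4
+// built from the pixel_data_f32 produced by __hsail_rdimagef_1d_s32.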
+
+DefReadImage1D(read_imagef, __hsail_rdimagef_1d_s32, float, int, _f32)
+DefReadImage1D(read_imagef, __hsail_rdimagef_1d_f32, float, float, _f32)
+DefReadImage1D(read_imagei, __hsail_rdimagei_1d_s32, int, int, _s32)
+DefReadImage1D(read_imagei, __hsail_rdimagei_1d_f32, int, float, _s32)
+DefReadImage1D(read_imageui, __hsail_rdimageui_1d_s32, uint, int, _u32)
+DefReadImage1D(read_imageui, __hsail_rdimageui_1d_f32, uint, float, _u32)
+
+// Read Image 1d Array
+
+#define DefReadImage1DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf)  \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image1d_array_t, sampler_t, CoordTy, CoordTy);            \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image1d_array_t image, sampler_t sampler, CoordTy##2 coords) {   \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, sampler, coords.x, coords.y);           \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefReadImage1DArray(read_imagef, __hsail_rdimagef_1da_s32, float, int, _f32)
+DefReadImage1DArray(read_imagef, __hsail_rdimagef_1da_f32, float, float, _f32)
+DefReadImage1DArray(read_imagei, __hsail_rdimagei_1da_s32, int, int, _s32)
+DefReadImage1DArray(read_imagei, __hsail_rdimagei_1da_f32, int, float, _s32)
+DefReadImage1DArray(read_imageui, __hsail_rdimageui_1da_s32, uint, int, _u32)
+DefReadImage1DArray(read_imageui, __hsail_rdimageui_1da_f32, uint, float, _u32)
+
+// Read Image 2d 
+
+#define DefReadImage2D(Func, HsailIntrin, DstTy, CoordTy, DstSuf)       \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image2d_t, sampler_t, CoordTy, CoordTy);                  \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image2d_t image, sampler_t sampler, CoordTy##2 coords) {         \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, sampler, coords.x, coords.y);           \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+}
+
+DefReadImage2D(read_imagef, __hsail_rdimagef_2d_s32, float, int, _f32)
+DefReadImage2D(read_imagef, __hsail_rdimagef_2d_f32, float, float, _f32)
+DefReadImage2D(read_imagei, __hsail_rdimagei_2d_s32, int, int, _s32)
+DefReadImage2D(read_imagei, __hsail_rdimagei_2d_f32, int, float, _s32)
+DefReadImage2D(read_imageui, __hsail_rdimageui_2d_s32, uint, int, _u32)
+DefReadImage2D(read_imageui, __hsail_rdimageui_2d_f32, uint, float, _u32)
+
+// Read Image 2d Array
+
+#define DefReadImage2DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf)  \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image2d_array_t, sampler_t, CoordTy, CoordTy, CoordTy);   \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image2d_array_t image, sampler_t sampler, CoordTy##4 coords) {   \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, sampler, coords.x, coords.y, coords.z); \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefReadImage2DArray(read_imagef, __hsail_rdimagef_2da_s32, float, int, _f32)
+DefReadImage2DArray(read_imagef, __hsail_rdimagef_2da_f32, float, float, _f32)
+DefReadImage2DArray(read_imagei, __hsail_rdimagei_2da_s32, int, int, _s32)
+DefReadImage2DArray(read_imagei, __hsail_rdimagei_2da_f32, int, float, _s32)
+DefReadImage2DArray(read_imageui, __hsail_rdimageui_2da_s32, uint, int, _u32)
+DefReadImage2DArray(read_imageui, __hsail_rdimageui_2da_f32, uint, float, _u32)
+
+// Read Image 3d
+
+#define DefReadImage3D(Func, HsailIntrin, DstTy, CoordTy, DstSuf)       \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image3d_t, sampler_t, CoordTy, CoordTy, CoordTy);         \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image3d_t image, sampler_t sampler, CoordTy##4 coords) {         \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, sampler, coords.x, coords.y, coords.z); \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefReadImage3D(read_imagef, __hsail_rdimagef_3d_s32, float, int, _f32)
+DefReadImage3D(read_imagef, __hsail_rdimagef_3d_f32, float, float, _f32)
+DefReadImage3D(read_imagei, __hsail_rdimagei_3d_s32, int, int, _s32)
+DefReadImage3D(read_imagei, __hsail_rdimagei_3d_f32, int, float, _s32)
+DefReadImage3D(read_imageui, __hsail_rdimageui_3d_s32, uint, int, _u32)
+DefReadImage3D(read_imageui, __hsail_rdimageui_3d_f32, uint, float, _u32)
+
+// Sampler-less Read Image 1d
+
+#define DefLoadImage1D(Func, HsailIntrin, DstTy, CoordTy, DstSuf)       \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image1d_t, CoordTy);                                      \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image1d_t image, CoordTy coord) {                                \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, coord);                                 \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefLoadImage1D(read_imagef, __hsail_ldimagef_1d_u32, float, int, _f32)
+DefLoadImage1D(read_imagei, __hsail_ldimagei_1d_u32, int, int, _s32)
+DefLoadImage1D(read_imageui, __hsail_ldimageui_1d_u32, uint, int, _u32)
+
+// Sampler-less Read Image 1d buffer
+
+#define DefLoadImage1DBuffer(Func, HsailIntrin, DstTy, CoordTy, DstSuf) \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image1d_buffer_t, CoordTy);                               \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image1d_buffer_t image, CoordTy coord) {                         \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, coord);                                 \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefLoadImage1DBuffer(read_imagef, __hsail_ldimagef_1db_u32, float, int, _f32)
+DefLoadImage1DBuffer(read_imagei, __hsail_ldimagei_1db_u32, int, int, _s32)
+DefLoadImage1DBuffer(read_imageui, __hsail_ldimageui_1db_u32, uint, int, _u32)
+
+// Sampler-less Read Image 1d Array
+
+#define DefLoadImage1DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf)  \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image1d_array_t, CoordTy, CoordTy);                       \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image1d_array_t image, CoordTy##2 coords) {                      \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, coords.x, coords.y);                    \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefLoadImage1DArray(read_imagef, __hsail_ldimagef_1da_u32, float, int, _f32)
+DefLoadImage1DArray(read_imagei, __hsail_ldimagei_1da_u32, int, int, _s32)
+DefLoadImage1DArray(read_imageui, __hsail_ldimageui_1da_u32, uint, int, _u32)
+
+// Sampler-less Read Image 2d
+
+#define DefLoadImage2D(Func, HsailIntrin, DstTy, CoordTy, DstSuf)       \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image2d_t, CoordTy, CoordTy);                             \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image2d_t image, CoordTy##2 coords) {                            \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, coords.x, coords.y);                    \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefLoadImage2D(read_imagef, __hsail_ldimagef_2d_u32, float, int, _f32)
+DefLoadImage2D(read_imagei, __hsail_ldimagei_2d_u32, int, int, _s32)
+DefLoadImage2D(read_imageui, __hsail_ldimageui_2d_u32, uint, int, _u32)
+
+// Sampler-less Read Image 2d array
+
+#define DefLoadImage2DArray(Func, HsailIntrin, DstTy, CoordTy, DstSuf)  \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image2d_array_t, CoordTy, CoordTy, CoordTy);              \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image2d_array_t image, CoordTy##4 coords) {                      \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, coords.x, coords.y, coords.z);          \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefLoadImage2DArray(read_imagef, __hsail_ldimagef_2da_u32, float, int, _f32)
+DefLoadImage2DArray(read_imagei, __hsail_ldimagei_2da_u32, int, int, _s32)
+DefLoadImage2DArray(read_imageui, __hsail_ldimageui_2da_u32, uint, int, _u32)
+
+// Sampler-less Read Image 3d
+
+#define DefLoadImage3D(Func, HsailIntrin, DstTy, CoordTy, DstSuf)  \
+  extern struct pixel_data##DstSuf                                      \
+  HsailIntrin(image3d_t, CoordTy, CoordTy, CoordTy);                    \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy##4                 \
+  Func(image3d_t image, CoordTy##4 coords) {                            \
+    struct pixel_data##DstSuf                                           \
+      data = HsailIntrin(image, coords.x, coords.y, coords.z);          \
+    return (DstTy##4)(data.x, data.y, data.z, data.w);                  \
+  }
+
+DefLoadImage3D(read_imagef, __hsail_ldimagef_3d_u32, float, int, _f32)
+DefLoadImage3D(read_imagei, __hsail_ldimagei_3d_u32, int, int, _s32)
+DefLoadImage3D(read_imageui, __hsail_ldimageui_3d_u32, uint, int, _u32)
+
+#if __OPENCL_C_VERSION__ >= 200
+// Image-2.0 read built-ins
+
+// Read Image 2d depth
+#define DefReadImage2DDepth(Func, HsailIntrin, DstTy, CoordTy)          \
+  float                                                                 \
+   HsailIntrin(image2d_depth_t, sampler_t, CoordTy, CoordTy);           \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy                    \
+  Func(image2d_depth_t image, sampler_t sampler, CoordTy##2 coords) {   \
+    float                                                               \
+      data = HsailIntrin(image, sampler, coords.x, coords.y);           \
+    return data;                                                        \
+}
+
+DefReadImage2DDepth(read_imagef, __hsail_rdimagef_2ddepth_s32, float, int)
+DefReadImage2DDepth(read_imagef, __hsail_rdimagef_2ddepth_f32, float, float)
+
+// Read Image 2d array depth
+
+#define DefReadImage2DArrayDepth(Func, HsailIntrin, DstTy, CoordTy)                  \
+  float                                                                              \
+   HsailIntrin(image2d_array_depth_t, sampler_t, CoordTy, CoordTy, CoordTy);         \
+                                                                                     \
+  __attribute__((overloadable, always_inline)) DstTy                                 \
+  Func(image2d_array_depth_t image, sampler_t sampler, CoordTy##4 coords) {          \
+    float                                                                            \
+      data = HsailIntrin(image, sampler, coords.x, coords.y, coords.z);              \
+    return data;                                                                     \
+}
+
+DefReadImage2DArrayDepth(read_imagef, __hsail_rdimagef_2dadepth_s32, float, int)
+DefReadImage2DArrayDepth(read_imagef, __hsail_rdimagef_2dadepth_f32, float, float)
+
+
+// Sampler-less Read Image 2d depth
+#define DefLoadImage2DDepth(Func, HsailIntrin, DstTy, CoordTy)          \
+  float                                                                 \
+   HsailIntrin(image2d_depth_t, CoordTy, CoordTy);                      \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy                    \
+  Func(image2d_depth_t image, CoordTy##2 coords) {                      \
+    float                                                               \
+      data = HsailIntrin(image, coords.x, coords.y);                    \
+    return data;                                                        \
+}
+
+DefLoadImage2DDepth(read_imagef, __hsail_ldimagef_2ddepth_u32, float, int)
+
+// Sampler-less Read Image 2d array depth
+
+#define DefLoadImage2DArrayDepth(Func, HsailIntrin, DstTy, CoordTy)     \
+  float                                                                 \
+   HsailIntrin(image2d_array_depth_t, CoordTy, CoordTy, CoordTy);       \
+                                                                        \
+  __attribute__((overloadable, always_inline)) DstTy                    \
+  Func(image2d_array_depth_t image, CoordTy##4 coords) {                \
+    float                                                               \
+      data = HsailIntrin(image, coords.x, coords.y, coords.z);          \
+    return data;                                                        \
+}
+
+DefLoadImage2DArrayDepth(read_imagef, __hsail_ldimagef_2dadepth_u32, float, int)
+
+#endif // __OPENCL_C_VERSION__ >= 200
diff --git a/amd-builtins/image/write.cl b/amd-builtins/image/write.cl
new file mode 100644
index 0000000..0bf5abe
--- /dev/null
+++ b/amd-builtins/image/write.cl
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// Image write built-ins
+
+// Hsail store image intrinsics
+extern void __hsail_stimagef_1d_i32(float, float, float, float, image1d_t, int);
+extern void __hsail_stimagei_1d_i32(int, int, int, int, image1d_t, int);
+extern void __hsail_stimageui_1d_i32(uint, uint, uint, uint, image1d_t, int);
+
+extern void __hsail_stimagef_1db_i32(float, float, float, float, image1d_buffer_t, int);
+extern void __hsail_stimagei_1db_i32(int, int, int, int, image1d_buffer_t, int);
+extern void __hsail_stimageui_1db_i32(uint, uint, uint, uint, image1d_buffer_t, int);
+
+extern void __hsail_stimagef_1da_i32(float, float, float, float, image1d_array_t, int, int);
+extern void __hsail_stimagei_1da_i32(int, int, int, int, image1d_array_t, int, int);
+extern void __hsail_stimageui_1da_i32(uint, uint, uint, uint, image1d_array_t, int, int);
+
+extern void __hsail_stimagef_2d_i32(float, float, float, float, image2d_t, int, int);
+extern void __hsail_stimagei_2d_i32(int, int, int, int, image2d_t, int, int);
+extern void __hsail_stimageui_2d_i32(uint, uint, uint, uint, image2d_t, int, int);
+
+extern void __hsail_stimagef_2da_i32(float, float, float, float, image2d_array_t, int, int, int, int);
+extern void __hsail_stimagei_2da_i32(int, int, int, int, image2d_array_t, int, int, int, int);
+extern void __hsail_stimageui_2da_i32(uint, uint, uint, uint, image2d_array_t, int, int, int, int);
+
+extern void __hsail_stimagef_3d_i32(float, float, float, float, image3d_t, int, int, int, int);
+extern void __hsail_stimagei_3d_i32(int, int, int, int, image3d_t, int, int, int, int);
+extern void __hsail_stimageui_3d_i32(uint, uint, uint, uint, image3d_t, int, int, int, int);
+
+
+#define DefWriteImage1d(Func,HsailIntrin,CoordTy,ValTy) \
+__attribute__((overloadable, always_inline)) void \
+Func(image1d_t image, CoordTy coords, ValTy##4 val) { \
+  HsailIntrin(val.x, val.y, val.z, val.w, image, coords); \
+}
+
+#define DefWriteImage1dBuffer(Func,HsailIntrin,CoordTy,ValTy) \
+__attribute__((overloadable, always_inline)) void \
+Func(image1d_buffer_t image, CoordTy coords, ValTy##4 val) { \
+  HsailIntrin(val.x, val.y, val.z, val.w, image, coords); \
+}
+
+#define DefWriteImage1dArray(Func,HsailIntrin,CoordTy,ValTy) \
+__attribute__((overloadable, always_inline)) void \
+  Func(image1d_array_t image, CoordTy##2 coords, ValTy##4 val) { \
+  HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y); \
+}
+
+#define DefWriteImage2d(Func,HsailIntrin,CoordTy,ValTy) \
+__attribute__((overloadable, always_inline)) void \
+Func(image2d_t image, CoordTy##2 coords, ValTy##4 val) { \
+  HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y); \
+}
+
+#define DefWriteImage2dArray(Func,HsailIntrin,CoordTy,ValTy) \
+__attribute__((overloadable, always_inline)) void \
+Func(image2d_array_t image, CoordTy##4 coords, ValTy##4 val) { \
+  HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y, coords.z, coords.w); \
+}
+
+#define DefWriteImage3d(Func,HsailIntrin,CoordTy,ValTy) \
+__attribute__((overloadable, always_inline)) void \
+Func(image3d_t image, CoordTy##4 coords, ValTy##4 val) { \
+  HsailIntrin(val.x, val.y, val.z, val.w, image, coords.x, coords.y, coords.z, coords.w); \
+}
+
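+// For illustration (not part of the build), DefWriteImage2d(write_imagef,
+// __hsail_stimagef_2d_i32, int, float) expands to roughly:
+//
+//   __attribute__((overloadable, always_inline)) void
+//   write_imagef(image2d_t image, int2 coords, float4 val) {
+//       __hsail_stimagef_2d_i32(val.x, val.y, val.z, val.w,
+//                               image, coords.x, coords.y);
+//   }
+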
+// Write Image 1d
+DefWriteImage1d(write_imagef, __hsail_stimagef_1d_i32, int, float)
+DefWriteImage1d(write_imagei, __hsail_stimagei_1d_i32, int, int)
+DefWriteImage1d(write_imageui, __hsail_stimageui_1d_i32, int, uint)
+
+// Write Image 1d Array
+DefWriteImage1dArray(write_imagef, __hsail_stimagef_1da_i32, int, float)
+DefWriteImage1dArray(write_imagei, __hsail_stimagei_1da_i32, int, int)
+DefWriteImage1dArray(write_imageui, __hsail_stimageui_1da_i32, int, uint)
+
+// Write Image 1d Buffer
+DefWriteImage1dBuffer(write_imagef, __hsail_stimagef_1db_i32, int, float)
+DefWriteImage1dBuffer(write_imagei, __hsail_stimagei_1db_i32, int, int)
+DefWriteImage1dBuffer(write_imageui, __hsail_stimageui_1db_i32, int, uint)
+
+// Write Image 2d
+DefWriteImage2d(write_imagef, __hsail_stimagef_2d_i32, int, float)
+DefWriteImage2d(write_imagei, __hsail_stimagei_2d_i32, int, int)
+DefWriteImage2d(write_imageui, __hsail_stimageui_2d_i32, int, uint)
+
+// Write Image 2d Array
+DefWriteImage2dArray(write_imagef, __hsail_stimagef_2da_i32, int, float)
+DefWriteImage2dArray(write_imagei, __hsail_stimagei_2da_i32, int, int)
+DefWriteImage2dArray(write_imageui, __hsail_stimageui_2da_i32, int, uint)
+
+// Write Image 3d
+DefWriteImage3d(write_imagef, __hsail_stimagef_3d_i32, int, float)
+DefWriteImage3d(write_imagei, __hsail_stimagei_3d_i32, int, int)
+DefWriteImage3d(write_imageui, __hsail_stimageui_3d_i32, int, uint)
+
+#ifdef __clang__
+// OpenCL 2.0 image write built-ins (depth images)
+
+// HSAIL store-image intrinsics for depth images
+extern void __hsail_stimagef_2ddepth_i32(float, image2d_depth_t, int, int);
+extern void __hsail_stimagef_2dadepth_i32(float, image2d_array_depth_t, int, int, int, int);
+
+#define DefWriteImage2dDepth(Func, HsailIntrin, CoordTy, ValTy) \
+__attribute__((overloadable, always_inline)) void \
+Func(image2d_depth_t image, CoordTy##2 coords, ValTy val) { \
+  HsailIntrin(val, image, coords.x, coords.y); \
+}
+
+#define DefWriteImage2dArrayDepth(Func, HsailIntrin, CoordTy, ValTy) \
+__attribute__((overloadable, always_inline)) void \
+Func(image2d_array_depth_t image, CoordTy##4 coords, ValTy val) { \
+  HsailIntrin(val, image, coords.x, coords.y, coords.z, coords.w); \
+}
+
+// Write Image 2d Depth
+DefWriteImage2dDepth(write_imagef, __hsail_stimagef_2ddepth_i32, int, float)
+
+// Write Image 2d Array Depth
+DefWriteImage2dArrayDepth(write_imagef, __hsail_stimagef_2dadepth_i32, int, float)
+
+#endif
\ No newline at end of file
diff --git a/amd-builtins/int/abs_base.cl b/amd-builtins/int/abs_base.cl
new file mode 100644
index 0000000..d0c2317
--- /dev/null
+++ b/amd-builtins/int/abs_base.cl
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) uchar
+abs(char x)
+{
+    char s = x >> 7;
+    return (uchar)((x + s) ^ s);
+}
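+// Branchless absolute value: s is the sign mask (-1 when x < 0, else 0),
+// and (x + s) ^ s yields -x for negative x and x unchanged otherwise;
+// returning uchar keeps abs(-128) == 128 representable.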
+
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+abs(uchar x)
+{
+    return x;
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) ushort
+abs(short x)
+{
+    short s = x >> 15;
+    return (ushort)((x + s) ^ s);
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+abs(ushort x)
+{
+    return x;
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) uint
+abs(int x)
+{
+    int s = x >> 31;
+    return (uint)((x + s) ^ s);
+}
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+abs(uint x)
+{
+    return x;
+}
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) ulong
+abs(long x)
+{
+    long s = x >> 63;
+    return (ulong)((x + s) ^ s);
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+abs(ulong x)
+{
+    return x;
+}
+
diff --git a/amd-builtins/int/abs_diff_base.cl b/amd-builtins/int/abs_diff_base.cl
new file mode 100644
index 0000000..0cf2007
--- /dev/null
+++ b/amd-builtins/int/abs_diff_base.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) uchar
+abs_diff(char x, char y)
+{
+    int ix = x;
+    int iy = y;
+    int d = max(ix,iy) - min(ix,iy);
+    return (uchar)d;
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+abs_diff(uchar x, uchar y)
+{
+    uint ux = x;
+    uint uy = y;
+    uint d = max(ux,uy) - min(ux,uy);
+    return (uchar)d;
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) ushort
+abs_diff(short x, short y)
+{
+    int ix = x;
+    int iy = y;
+    int d = max(ix,iy) - min(ix,iy);
+    return (ushort)d;
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+abs_diff(ushort x, ushort y)
+{
+    uint ux = x;
+    uint uy = y;
+    uint d = max(ux,uy) - min(ux,uy);
+    return (ushort)d;
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) uint
+abs_diff(int x, int y)
+{
+    return (uint)(max(x,y) - min(x,y));
+}
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+abs_diff(uint x, uint y)
+{
+    return max(x,y) - min(x,y);
+}
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) ulong
+abs_diff(long x, long y)
+{
+    long xmy = x - y;
+    long ymx = y - x;
+    return (ulong)(x > y ? xmy : ymx);
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+abs_diff(ulong x, ulong y)
+{
+    ulong xmy = x - y;
+    ulong ymx = y - x;
+    return x > y ? xmy : ymx;
+}
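+
+// Computing both x - y and y - x and selecting on x > y gives the true
+// magnitude even when the subtraction wraps: the wrapped bit pattern is
+// already the correct result once reinterpreted as ulong.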
+
diff --git a/amd-builtins/int/add_sat_base.cl b/amd-builtins/int/add_sat_base.cl
new file mode 100644
index 0000000..7adc9f8
--- /dev/null
+++ b/amd-builtins/int/add_sat_base.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+add_sat(char x, char y)
+{
+    int s = (int)x + (int) y;
+    return max(-128, min(127, s));
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+add_sat(uchar x, uchar y)
+{
+    uint s = (uint)x + (uint)y;
+    return min(255U, s);
+}
+
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+add_sat(short x, short y)
+{
+    int s = (int)x + (int) y;
+    return max(-32768, min(32767, s));
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+add_sat(ushort x, ushort y)
+{
+    uint s = (uint)x + (uint)y;
+    return min(65535U, s);
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+add_sat(int x, int y)
+{
+    int s = x + y;
+    s = y < 1 & (int)0x80000000 - y > x ? (int)0x80000000 : s;
+    s = y > 0 & 0x7fffffff - y < x ? 0x7fffffff : s;
+    return s;
+}
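+
+// Overflow is detected without a wider type: the sum exceeds INT_MAX
+// exactly when y > 0 and x > 0x7fffffff - y, and drops below INT_MIN
+// exactly when y <= 0 and x < (int)0x80000000 - y. The comparisons bind
+// tighter than &, so each guard parses as (cond1) & (cond2).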
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+add_sat(uint x, uint y)
+{
+    uint s = x + y;
+    s = 0xffffffffU - y < x ? 0xffffffffU : s;
+    return s;
+}
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) long
+add_sat(long x, long y)
+{
+    long s = x + y;
+    s = y < 1 & (long)0x8000000000000000 - y > x ? (long)0x8000000000000000 : s;
+    s = y > 0 & 0x7fffffffffffffffL - y < x ? 0x7fffffffffffffffL : s;
+    return s;
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+add_sat(ulong x, ulong y)
+{
+    ulong s = x + y;
+    s = 0xffffffffffffffffUL - y < x ? 0xffffffffffffffffUL : s;
+    return s;
+}
+
diff --git a/amd-builtins/int/clz_base.cl b/amd-builtins/int/clz_base.cl
new file mode 100644
index 0000000..21f2c5f
--- /dev/null
+++ b/amd-builtins/int/clz_base.cl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+__attribute__((always_inline)) static uint
+myclz4(uint x)
+{
+    uint z =  __hsail_firstbit_u32(x);
+    return x == 0U ? 32U : z;
+}
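+
+// __hsail_firstbit_u32 appears to return the leading-zero count of a
+// nonzero argument (its value for zero is not relied on), so the select
+// pins clz(0) to 32.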
+
+// ----- [u]char -----
+
+__attribute__((overloadable, always_inline)) char
+clz(char x)
+{
+    return myclz4((uint)x & 0xffU) - 24U;
+}
+
+__attribute__((overloadable, always_inline)) uchar
+clz(uchar x)
+{
+    return myclz4((uint)x) - 24U;
+}
+
+// ----- [u]short -----
+
+__attribute__((overloadable, always_inline)) short
+clz(short x)
+{
+    return myclz4((uint)x & 0xffffU) - 16U;
+}
+
+__attribute__((overloadable, always_inline)) ushort
+clz(ushort x)
+{
+    return myclz4((uint)x) - 16U;
+}
+
+// ----- [u]int -----
+
+extern __attribute__((overloadable, alias("myclz4"))) uint clz(uint);
+extern __attribute__((overloadable, alias("myclz4"))) int clz(int);
+
+// ----- [u]long -----
+
+__attribute__((always_inline)) static ulong
+myclz8(ulong x)
+{
+    uint xlo = (uint)x;
+    uint xhi = (uint)(x >> 32);
+    uint zlo = __hsail_firstbit_u32(xlo);
+    uint zhi = __hsail_firstbit_u32(xhi);
+    uint clo = (xlo == 0 ? 32 : zlo) + 32;
+    return xhi == 0 ? clo : zhi;
+}
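+
+// 64-bit clz from two 32-bit halves: a nonzero high word decides the
+// result outright; otherwise add 32 to the low word's count, which
+// itself saturates to 32 when x == 0, giving clz(0) == 64.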
+
+extern __attribute__((overloadable, alias("myclz8"))) ulong clz(ulong);
+extern __attribute__((overloadable, alias("myclz8"))) long clz(long);
+
diff --git a/amd-builtins/int/ctz_base.cl b/amd-builtins/int/ctz_base.cl
new file mode 100644
index 0000000..545ac03
--- /dev/null
+++ b/amd-builtins/int/ctz_base.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+#if __OPENCL_C_VERSION__ >= 200
+
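+// ctz was introduced in OpenCL C 2.0, hence the version guard.
+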
+// ----- [u]char -----
+
+__attribute__((overloadable, always_inline)) char
+ctz(char x)
+{
+    uint z = __hsail_lastbit_u32((uint)x & 0xffU);
+    return x == 0 ? 8U : z;
+}
+
+__attribute__((overloadable, always_inline)) uchar
+ctz(uchar x)
+{
+    uint z = __hsail_lastbit_u32((uint)x);
+    return x == 0 ? 8U : z;
+}
+
+// ----- [u]short -----
+
+__attribute__((overloadable, always_inline)) short
+ctz(short x)
+{
+    uint z = __hsail_lastbit_u32((uint)x & 0xffffU);
+    return x == 0 ? 16U : z;
+}
+
+__attribute__((overloadable, always_inline)) ushort
+ctz(ushort x)
+{
+    uint z = __hsail_lastbit_u32((uint)x);
+    return x == 0 ? 16U : z;
+}
+
+// ----- [u]int -----
+
+__attribute__((always_inline)) static uint
+myctz4(uint x)
+{
+    uint z = __hsail_lastbit_u32(x);
+    return x == 0U ? 32 : z;
+}
+
+extern __attribute__((overloadable, alias("myctz4"))) uint ctz(uint);
+extern __attribute__((overloadable, alias("myctz4"))) int ctz(int);
+
+// ----- [u]long -----
+
+__attribute__((always_inline)) static ulong
+myctz8(ulong x)
+{
+    uint xhi = x >> 32;
+    uint xlo = (uint)x;
+    uint zhi = __hsail_lastbit_u32(xhi);
+    uint zlo = __hsail_lastbit_u32(xlo);
+    uint chi = (xhi == 0 ? 32 : zhi) + 32;
+    return xlo == 0 ? chi : zlo;
+}
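+
+// Mirror of the clz split: __hsail_lastbit_u32 appears to return the
+// trailing-zero count of a nonzero word, and the selects extend it to
+// 64 bits with ctz(0) == 64.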
+
+extern __attribute__((overloadable, alias("myctz8"))) ulong ctz(ulong);
+extern __attribute__((overloadable, alias("myctz8"))) long ctz(long);
+
+#endif
diff --git a/amd-builtins/int/hadd_base.cl b/amd-builtins/int/hadd_base.cl
new file mode 100644
index 0000000..b19ec2c
--- /dev/null
+++ b/amd-builtins/int/hadd_base.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+hadd(char x, char y)
+{
+    // operands are promoted to a wider type, so the sum cannot wrap
+    return (x + y) >> 1;
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+hadd(uchar x, uchar y)
+{
+    // operands are promoted to a wider type, so the sum cannot wrap
+    return (x + y) >> 1;
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+hadd(short x, short y)
+{
+    // operands are promoted to a wider type, so the sum cannot wrap
+    return (x + y) >> 1;
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+hadd(ushort x, ushort y)
+{
+    // operands are promoted to a wider type, so the sum cannot wrap
+    return (x + y) >> 1;
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+hadd(int x, int y)
+{
+    int cin = (x & 1) & y;
+    return (x >> 1) + (y >> 1) + cin;
+}
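+
+// (x >> 1) + (y >> 1) drops both low bits; the halved sum is short by
+// one exactly when both low bits are set, which (x & 1) & y restores as
+// the carry-in.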
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+hadd(uint x, uint y)
+{
+    uint cin = (x & 1U) & y;
+    return (x >> 1) + (y >> 1) + cin;
+}
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) long
+hadd(long x, long y)
+{
+    long cin = (x & 1) & y;
+    return (x >> 1) + (y >> 1) + cin;
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+hadd(ulong x, ulong y)
+{
+    ulong cin = (x & 1) & y;
+    return (x >> 1) + (y >> 1) + cin;
+}
+
diff --git a/amd-builtins/int/ibuiltins.h b/amd-builtins/int/ibuiltins.h
new file mode 100644
index 0000000..a46ae6f
--- /dev/null
+++ b/amd-builtins/int/ibuiltins.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((pure)) int   __amdil_count_bits_i32(int);
+extern __attribute__((pure)) int   __hsail_firstbit_u32(uint);
+extern __attribute__((pure)) int   __hsail_lastbit_u32(uint);
+
+extern __attribute__((pure)) int   __amdil_imad24_i32(int, int, int);
+extern __attribute__((pure)) uint  __amdil_umad24_u32(uint, uint, uint);
+extern __attribute__((pure)) int   __amdil_imul24_i32(int, int);
+extern __attribute__((pure)) uint  __amdil_umul24_u32(uint, uint);
+
+extern __attribute__((pure)) int   __amdil_imin_i32(int,  int);
+extern __attribute__((pure)) int   __amdil_imax_i32(int,  int);
+extern __attribute__((pure)) uint  __amdil_umin_u32(uint,  uint);
+extern __attribute__((pure)) uint  __amdil_umax_u32(uint,  uint);
+
+extern __attribute__((pure)) long  __amdil_imin_i64(long,  long);
+extern __attribute__((pure)) long  __amdil_imax_i64(long,  long);
+extern __attribute__((pure)) ulong __amdil_umin_u64(ulong,  ulong);
+extern __attribute__((pure)) ulong __amdil_umax_u64(ulong,  ulong);
+
+extern __attribute__((pure)) int   __amdil_imul_high_i32(int,  int);
+extern __attribute__((pure)) uint  __amdil_umul_high_u32(uint,  uint);
+
+static inline long
+_gpu_mul_hi_i64(long x, long y)
+{
+    ulong x0 = (ulong)x & 0xffffffffUL;
+    long x1 = x >> 32;
+    ulong y0 = (ulong)y & 0xffffffffUL;
+    long y1 = y >> 32;
+    ulong z0 = x0*y0;
+    long t = x1*y0 + (z0 >> 32);
+    long z1 = t & 0xffffffffL;
+    long z2 = t >> 32;
+    z1 = x0*y1 + z1;
+    return x1*y1 + z2 + (z1 >> 32);
+}
+
+static inline ulong
+_gpu_mul_hi_u64(ulong x, ulong y)
+{
+    ulong x0 = x & 0xffffffffUL;
+    ulong x1 = x >> 32;
+    ulong y0 = y & 0xffffffffUL;
+    ulong y1 = y >> 32;
+    ulong z0 = x0*y0;
+    ulong t = x1*y0 + (z0 >> 32);
+    ulong z1 = t & 0xffffffffUL;
+    ulong z2 = t >> 32;
+    z1 = x0*y1 + z1;
+    return x1*y1 + z2 + (z1 >> 32);
+}
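+
+// Both helpers form the high half of the full product by 32x32
+// schoolbook multiplication: with x = x1*2^32 + x0 and y = y1*2^32 + y0,
+// x*y = x1*y1*2^64 + (x1*y0 + x0*y1)*2^32 + x0*y0, and only the bits at
+// 2^64 and above are returned.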
+
diff --git a/amd-builtins/int/mad_hi_base.cl b/amd-builtins/int/mad_hi_base.cl
new file mode 100644
index 0000000..85e0631
--- /dev/null
+++ b/amd-builtins/int/mad_hi_base.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+mad_hi(char a, char b, char c)
+{
+    return (char)(((int)a * (int)b) >> 8) + c;
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+mad_hi(uchar a, uchar b, uchar c)
+{
+    return (uchar)(((uint)a * (uint)b) >> 8) + c;
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+mad_hi(short a, short b, short c)
+{
+    return (short)(((int)a * (int)b) >> 16) + c;
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+mad_hi(ushort a, ushort b, ushort c)
+{
+    return (ushort)(((uint)a * (uint)b) >> 16) + c;
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+mad_hi(int a, int b, int c)
+{
+    return (int)(((long)a * (long)b) >> 32) + c;
+}
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+mad_hi(uint a, uint b, uint c)
+{
+    return (uint)(((ulong)a * (ulong)b) >> 32) + c;
+}
+
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) long
+mad_hi(long a, long b, long c)
+{
+    return _gpu_mul_hi_i64(a, b) + c;
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+mad_hi(ulong a, ulong b, ulong c)
+{
+    return _gpu_mul_hi_u64(a, b) + c;
+}
+
diff --git a/amd-builtins/int/mad_sat_base.cl b/amd-builtins/int/mad_sat_base.cl
new file mode 100644
index 0000000..cc9270c
--- /dev/null
+++ b/amd-builtins/int/mad_sat_base.cl
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+mad_sat(char a, char b, char c)
+{
+    int s = (int)a * (int)b + (int)c;
+    return min(127, max(-128, s));
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+mad_sat(uchar a, uchar b, uchar c)
+{
+    uint s = (uint)a * (uint)b + (uint)c;
+    return min(255U, s);
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+mad_sat(short a, short b, short c)
+{
+    int s = (int)a * (int)b + (int)c;
+    return min(32767, max(-32768, s));
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+mad_sat(ushort a, ushort b, ushort c)
+{
+    uint s = (uint)a * (uint)b + (uint)c;
+    return min(65535U, s);
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+mad_sat(int a, int b, int c)
+{
+    int lo = a * b;
+    int hi = __amdil_imul_high_i32(a, b);
+    int t = lo + c;
+    hi += c > 0 & 0x7fffffff - c < lo;
+    hi -= c < 1 & (int)0x80000000 - c > lo;
+    lo = t;
+
+    lo = hi < 0 & (hi != -1 | lo >= 0) ? (int)0x80000000 : lo;
+    lo = hi >= 0 & (hi > 0 | lo < 0) ? 0x7fffffff : lo;
+    return lo;
+}
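+
+// hi:lo holds the exact 64-bit product; adding c propagates a carry or
+// borrow into hi, and the final selects clamp to INT_MIN or INT_MAX
+// whenever hi is not simply the sign-extension of lo (the in-range case).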
+
+// ----- uint -----
+__attribute__((overloadable, always_inline)) uint
+mad_sat(uint a, uint b, uint c)
+{
+    uint lo = a * b;
+    uint hi = __amdil_umul_high_u32(a, b);
+    uint t = lo + c;
+    hi += 0xffffffff - c < lo;
+    lo = t;
+    return hi > 0U ? 0xffffffff : lo;
+}
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) long
+mad_sat(long a, long b, long c)
+{
+    ulong a0 = (ulong)a & 0xffffffffUL;
+    long a1 = a >> 32;
+    ulong b0 = (ulong)b & 0xffffffffUL;
+    long b1 = b >> 32;
+    ulong s0 = a0*b0;
+    long t = a1*b0 + (s0 >> 32);
+    long s1 = t & 0xffffffffL;
+    long s2 = t >> 32;
+    s1 = a0*b1 + s1;
+    long lo = (s1 << 32) | (s0 & 0xffffffffL);
+    long hi = a1*b1 + s2 + (s1 >> 32);
+
+    t = lo + c;
+    hi += c > 0L & 0x7fffffffffffffffL - c < lo;
+    hi -= c < 1L & (long)0x8000000000000000L - c > lo;
+    lo = t;
+
+    lo = hi < 0L & (hi != -1L | lo >= 0L) ? (long)0x8000000000000000L : lo;
+    lo = hi >= 0L & (hi > 0L | lo < 0L) ? 0x7fffffffffffffffL : lo;
+
+    return lo;
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+mad_sat(ulong a, ulong b, ulong c)
+{
+    ulong a0 = a & 0xffffffffUL;
+    ulong a1 = a >> 32;
+    ulong b0 = b & 0xffffffffUL;
+    ulong b1 = b >> 32;
+    ulong s0 = a0*b0;
+    ulong t = a1*b0 + (s0 >> 32);
+    ulong s1 = t & 0xffffffffUL;
+    ulong s2 = t >> 32;
+    s1 = a0*b1 + s1;
+    ulong lo = (s1 << 32) | (s0 & 0xffffffffUL);
+    ulong hi = a1*b1 + s2 + (s1 >> 32);
+
+    t = lo + c;
+    hi += 0xffffffffffffffffUL - c < lo;
+    lo = t;
+
+    return hi > 0UL ? 0xffffffffffffffffUL : lo;
+}
+
diff --git a/amd-builtins/int/max_base.cl b/amd-builtins/int/max_base.cl
new file mode 100644
index 0000000..2048256
--- /dev/null
+++ b/amd-builtins/int/max_base.cl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+max(char x, char y)
+{
+    return __amdil_imax_i32(x, y);
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+max(uchar x, uchar y)
+{
+    return __amdil_umax_u32(x, y);
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+max(short x, short y)
+{
+    return __amdil_imax_i32(x, y);
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+max(ushort x, ushort y)
+{
+    return __amdil_umax_u32(x, y);
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+max(int x, int y)
+{
+    return __amdil_imax_i32(x, y);
+}
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+max(uint x, uint y)
+{
+    return __amdil_umax_u32(x, y);
+}
+
+// ----- long -----
+
+// __hsail_ intrinsics that have no __amdil_ equivalents.
+extern __attribute__((pure)) long  __hsail_max_s64(long,  long);
+extern __attribute__((pure)) ulong  __hsail_max_u64(ulong,  ulong);
+
+__attribute__((overloadable, always_inline)) long
+max(long x, long y)
+{
+    return __hsail_max_s64(x, y);
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+max(ulong x, ulong y)
+{
+    return __hsail_max_u64(x, y);
+}
+
diff --git a/amd-builtins/int/min_base.cl b/amd-builtins/int/min_base.cl
new file mode 100644
index 0000000..798ad57
--- /dev/null
+++ b/amd-builtins/int/min_base.cl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+min(char x, char y)
+{
+    return __amdil_imin_i32(x, y);
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+min(uchar x, uchar y)
+{
+    return __amdil_umin_u32(x, y);
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+min(short x, short y)
+{
+    return __amdil_imin_i32(x, y);
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+min(ushort x, ushort y)
+{
+    return __amdil_umin_u32(x, y);
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+min(int x, int y)
+{
+    return __amdil_imin_i32(x, y);
+}
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+min(uint x, uint y)
+{
+    return __amdil_umin_u32(x, y);
+}
+
+// ----- long -----
+
+// __hsail_ intrinsics that have no __amdil_ equivalents.
+extern __attribute__((pure)) long  __hsail_min_s64(long,  long);
+extern __attribute__((pure)) ulong  __hsail_min_u64(ulong,  ulong);
+
+__attribute__((overloadable, always_inline)) long
+min(long x, long y)
+{
+    return __hsail_min_s64(x, y);
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+min(ulong x, ulong y)
+{
+    return __hsail_min_u64(x, y);
+}
+
diff --git a/amd-builtins/int/mul24_base.cl b/amd-builtins/int/mul24_base.cl
new file mode 100644
index 0000000..0184c65
--- /dev/null
+++ b/amd-builtins/int/mul24_base.cl
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+mul24(int x, int y)
+{
+    return __amdil_imul24_i32(x, y);
+}
+
+__attribute__((overloadable, always_inline)) int
+mad24(int a, int b, int c)
+{
+    return __amdil_imad24_i32(a, b, c);
+}
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+mul24(uint x, uint y)
+{
+    return __amdil_umul24_u32(x, y);
+}
+
+__attribute__((overloadable, always_inline)) uint
+mad24(uint a, uint b, uint c)
+{
+    return __amdil_umad24_u32(a, b, c);
+}
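+
+// Note: per the OpenCL spec, mul24/mad24 results are defined only when
+// the multiplicands fit in 24 bits (signed in [-2^23, 2^23 - 1],
+// unsigned in [0, 2^24 - 1]); that guarantee is what lets them map to a
+// single 24-bit hardware multiply.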
diff --git a/amd-builtins/int/mul_hi_base.cl b/amd-builtins/int/mul_hi_base.cl
new file mode 100644
index 0000000..63d7cc6
--- /dev/null
+++ b/amd-builtins/int/mul_hi_base.cl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+mul_hi(char x, char y)
+{
+    return (char)(((int)x * (int)y) >> 8);
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+mul_hi(uchar x, uchar y)
+{
+    return (uchar)(((uint)x * (uint)y) >> 8);
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+mul_hi(short x, short y)
+{
+    return (short)(((int)x * (int)y) >> 16);
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+mul_hi(ushort x, ushort y)
+{
+    return (ushort)(((uint)x * (uint)y) >> 16);
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+mul_hi(int x, int y)
+{
+    return __amdil_imul_high_i32(x, y);
+}
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+mul_hi(uint x, uint y)
+{
+    return __amdil_umul_high_u32(x, y);
+}
+
+extern __attribute__((pure)) long  __hsail_mulhi_s64(long, long);
+extern __attribute__((pure)) ulong __hsail_mulhi_u64(ulong, ulong);
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) long
+mul_hi(long x, long y)
+{
+    return __hsail_mulhi_s64(x, y);
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+mul_hi(ulong x, ulong y)
+{
+    return __hsail_mulhi_u64(x, y);
+}
+
diff --git a/amd-builtins/int/popcnt_base.cl b/amd-builtins/int/popcnt_base.cl
new file mode 100644
index 0000000..79d0720
--- /dev/null
+++ b/amd-builtins/int/popcnt_base.cl
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+#ifdef USE_POPCNT
+#pragma OPENCL EXTENSION cl_amd_popcnt : enable
+#endif
+
+// ----- [u]char -----
+
+__attribute__((always_inline)) static char
+__POPCI1(char x)
+{
+    return __amdil_count_bits_i32((int)x & 0xff);
+}
+
+#ifdef USE_POPCNT
+extern __attribute__((overloadable, alias("__POPCI1"))) char popcnt(char);
+#endif
+
+extern __attribute__((overloadable, alias("__POPCI1"))) char popcount(char);
+
+__attribute__((always_inline)) static uchar
+__POPCU1(uchar x)
+{
+    return __amdil_count_bits_i32((int)x);
+}
+
+#ifdef USE_POPCNT
+extern __attribute__((overloadable, alias("__POPCU1"))) uchar popcnt(uchar);
+#endif
+
+extern __attribute__((overloadable, alias("__POPCU1"))) uchar popcount(uchar);
+
+
+// ----- [u]short -----
+
+__attribute__((always_inline)) static short
+__POPCI2(short x)
+{
+    return __amdil_count_bits_i32((int)x & 0xffff);
+}
+
+#ifdef USE_POPCNT
+extern __attribute__((overloadable, alias("__POPCI2"))) short popcnt(short);
+#endif
+
+extern __attribute__((overloadable, alias("__POPCI2"))) short popcount(short);
+
+__attribute__((always_inline)) static ushort
+__POPCU2(ushort x)
+{
+    return __amdil_count_bits_i32((int)x);
+}
+
+#ifdef USE_POPCNT
+extern __attribute__((overloadable, alias("__POPCU2"))) ushort popcnt(ushort);
+#endif
+
+extern __attribute__((overloadable, alias("__POPCU2"))) ushort popcount(ushort);
+
+
+// ----- [u]int -----
+
+__attribute__((always_inline)) static int
+__POPCI4(int x)
+{
+    return __amdil_count_bits_i32(x);
+}
+
+#ifdef USE_POPCNT
+extern __attribute__((overloadable, alias("__POPCI4"))) int popcnt(int);
+extern __attribute__((overloadable, alias("__POPCI4"))) uint popcnt(uint);
+#endif
+
+extern __attribute__((overloadable, alias("__POPCI4"))) int popcount(int);
+extern __attribute__((overloadable, alias("__POPCI4"))) uint popcount(uint);
+
+// ----- [u]long -----
+
+__attribute__((always_inline)) static long
+__POPCI8(long x)
+{
+    int chi = __amdil_count_bits_i32((int)(x >> 32));
+    int clo = __amdil_count_bits_i32((int)(x & 0xffffffffL));
+    return chi + clo;
+}
+
+#ifdef USE_POPCNT
+extern __attribute__((overloadable, alias("__POPCI8"))) long popcnt(long);
+extern __attribute__((overloadable, alias("__POPCI8"))) ulong popcnt(ulong);
+#endif
+
+extern __attribute__((overloadable, alias("__POPCI8"))) long popcount(long);
+extern __attribute__((overloadable, alias("__POPCI8"))) ulong popcount(ulong);
+
diff --git a/amd-builtins/int/rhadd_base.cl b/amd-builtins/int/rhadd_base.cl
new file mode 100644
index 0000000..e33168f
--- /dev/null
+++ b/amd-builtins/int/rhadd_base.cl
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+rhadd(char x, char y)
+{
+    // operands are promoted to a wider type, so x + y + 1 cannot wrap
+    return (x + y + 1) >> 1;
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+rhadd(uchar x, uchar y)
+{
+    // operands are promoted to a wider type, so x + y + 1 cannot wrap
+    return (x + y + 1U) >> 1;
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+rhadd(short x, short y)
+{
+    // operands are promoted to a wider type, so x + y + 1 cannot wrap
+    return (x + y + 1) >> 1;
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+rhadd(ushort x, ushort y)
+{
+    // operands are promoted to a wider type, so x + y + 1 cannot wrap
+    return (x + y + 1U) >> 1;
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+rhadd(int x, int y)
+{
+    int cin = (x | y) & 1;
+    return (x >> 1) + (y >> 1) + cin;
+}
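+
+// rhadd rounds the average up, so the carry-in fires whenever either low
+// bit is set ((x | y) & 1); compare hadd, whose (x & 1) & y carries only
+// when both low bits are set.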
+
+// ----- uint -----
+__attribute__((overloadable, always_inline)) uint
+rhadd(uint x, uint y)
+{
+    uint cin = (x | y) & 1;
+    return (x >> 1) + (y >> 1) + cin;
+}
+
+// ----- long -----
+__attribute__((overloadable, always_inline)) long
+rhadd(long x, long y)
+{
+    long cin = (x | y) & 1;
+    return (x >> 1) + (y >> 1) + cin;
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+rhadd(ulong x, ulong y)
+{
+    ulong cin = (x | y) & 1;
+    return (x >> 1) + (y >> 1) + cin;
+}
+
diff --git a/amd-builtins/int/rotate_base.cl b/amd-builtins/int/rotate_base.cl
new file mode 100644
index 0000000..a0ecf78
--- /dev/null
+++ b/amd-builtins/int/rotate_base.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "ibuiltins.h"
+
+// ----- [u]char -----
+
+__attribute__((always_inline)) static uchar
+__ROTI1(uchar x, uchar y)
+{
+    y &= 0x7;
+    return (x << y) | (x >> (8-y));
+}
+
+extern __attribute__((overloadable, alias("__ROTI1"))) uchar rotate(uchar, uchar);
+extern __attribute__((overloadable, alias("__ROTI1"))) char rotate(char, char);
+
+// ----- [u]short -----
+
+__attribute__((always_inline)) static ushort
+__ROTI2(ushort x, ushort y)
+{
+    y &= 0xf;
+    return (x << y) | (x >> (16-y));
+}
+
+extern __attribute__((overloadable, alias("__ROTI2"))) ushort rotate(ushort, ushort);
+extern __attribute__((overloadable, alias("__ROTI2"))) short rotate(short, short);
+
+// ----- [u]int -----
+extern __attribute__((const)) uint __hsail_bitalign_b32(uint, uint, uint);
+
+__attribute__((always_inline)) static uint
+__ROTI4(uint x, uint y)
+{
+    return __hsail_bitalign_b32(x, x, (-y) & 0x1f);
+}
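+
+// __hsail_bitalign_b32(hi, lo, s) appears to select 32 bits starting at
+// bit s of the 64-bit value hi:lo, so bitalign(x, x, s) is a right
+// rotate by s; a left rotate by y is then a right rotate by (-y) & 0x1f.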
+
+extern __attribute__((overloadable, alias("__ROTI4"))) uint rotate(uint, uint);
+extern __attribute__((overloadable, alias("__ROTI4"))) int rotate(int, int);
+
+// ----- [u]long -----
+
+__attribute__((always_inline)) static ulong
+__ROTI8(ulong x, ulong y)
+{
+    y &= 0x3f;
+    return (x << y) | (x >> (64-y));
+}
+
+extern __attribute__((overloadable, alias("__ROTI8"))) ulong rotate(ulong, ulong);
+extern __attribute__((overloadable, alias("__ROTI8"))) long rotate(long, long);
+
diff --git a/amd-builtins/int/sub_sat_base.cl b/amd-builtins/int/sub_sat_base.cl
new file mode 100644
index 0000000..cbca16c
--- /dev/null
+++ b/amd-builtins/int/sub_sat_base.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// ----- char -----
+
+__attribute__((overloadable, always_inline)) char
+sub_sat(char x, char y)
+{
+    int s = (int)x - (int) y;
+    return max(-128, min(127, s));
+}
+
+// ----- uchar -----
+
+__attribute__((overloadable, always_inline)) uchar
+sub_sat(uchar x, uchar y)
+{
+    int s = (int)x - (int)y;
+    return (uchar)max(s, 0);
+}
+
+// ----- short -----
+
+__attribute__((overloadable, always_inline)) short
+sub_sat(short x, short y)
+{
+    int s = (int)x - (int) y;
+    return max(-32768, min(32767, s));
+}
+
+// ----- ushort -----
+
+__attribute__((overloadable, always_inline)) ushort
+sub_sat(ushort x, ushort y)
+{
+    int s = (int)x - (int)y;
+    return (ushort)max(s, 0);
+}
+
+// ----- int -----
+
+__attribute__((overloadable, always_inline)) int
+sub_sat(int x, int y)
+{
+    int s = x - y;
+    s = y < 1 & 0x7fffffff + y < x ? 0x7fffffff : s;
+    s = y > 0 & (int)0x80000000 + y > x ? (int)0x80000000 : s;
+    return s;
+}
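+
+// Mirror of add_sat: x - y climbs past INT_MAX only when y <= 0 and
+// x > 0x7fffffff + y, and drops past INT_MIN only when y > 0 and
+// x < (int)0x80000000 + y.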
+
+// ----- uint -----
+
+__attribute__((overloadable, always_inline)) uint
+sub_sat(uint x, uint y)
+{
+    uint s = x - y;
+    return y > x ? 0U : s;
+}
+
+// ----- long -----
+
+__attribute__((overloadable, always_inline)) long
+sub_sat(long x, long y)
+{
+    long s = x - y;
+    s = y < 1 & 0x7fffffffffffffffL + y < x ? 0x7fffffffffffffffL : s;
+    s = y > 0 & (long)0x8000000000000000L + y > x ? (long)0x8000000000000000L : s;
+    return s;
+}
+
+// ----- ulong -----
+
+__attribute__((overloadable, always_inline)) ulong
+sub_sat(ulong x, ulong y)
+{
+    ulong s = x - y;
+    return y > x ? 0UL : s;
+}
+
diff --git a/amd-builtins/int/upsample_base.cl b/amd-builtins/int/upsample_base.cl
new file mode 100644
index 0000000..7cb0e60
--- /dev/null
+++ b/amd-builtins/int/upsample_base.cl
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "ibuiltins.h"
+
+// ----- (u)char -----
+
+__attribute__((overloadable, always_inline)) ushort
+upsample(uchar hi, uchar lo)
+{
+    return ((ushort)hi << 8) | lo;
+}
+
+__attribute__((overloadable, always_inline)) short
+upsample(char hi, uchar lo)
+{
+    return ((short)hi << 8) | lo;
+}
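+
+// Example: upsample((uchar)0x12, (uchar)0x34) == (ushort)0x1234; hi
+// supplies the high half of the doubled-width result.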
+
+// ----- (u)short -----
+
+__attribute__((overloadable, always_inline)) uint
+upsample(ushort hi, ushort lo)
+{
+    return ((uint)hi << 16) | lo;
+}
+
+__attribute__((overloadable, always_inline)) int
+upsample(short hi, ushort lo)
+{
+    return ((int)hi << 16) | lo;
+}
+
+// ----- (u)int -----
+
+__attribute__((overloadable, always_inline)) ulong
+upsample(uint hi, uint lo)
+{
+    return ((ulong)hi << 32) | lo;
+}
+
+__attribute__((overloadable, always_inline)) long
+upsample(int hi, uint lo)
+{
+    return ((long)hi << 32) | lo;
+}
+
diff --git a/amd-builtins/math32/acosF.cl b/amd-builtins/math32/acosF.cl
new file mode 100644
index 0000000..a55faee
--- /dev/null
+++ b/amd-builtins/math32/acosF.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+acos(float x)
+{
+    // Computes arccos(x).
+    // The argument is first reduced by noting that arccos(x)
+    // is invalid for abs(x) > 1. For denormal and small
+    // arguments arccos(x) = pi/2 to machine accuracy.
+    // Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arccos(x) = pi/2 - arcsin(x)
+    // = pi/2 - (x + x^3*R(x^2))
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arccos(|x|) = 2*arcsin(sqrt((1-|x|)/2)),
+    // and arccos(x) = pi - arccos(|x|) for x < 0,
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+
+    // Some constants and split constants.
+    const float piby2 = 1.5707963705e+00F;
+    const float pi = 3.1415926535897933e+00F;
+    const float piby2_head = 1.5707963267948965580e+00F;
+    const float piby2_tail = 6.12323399573676603587e-17F;
+
+    uint ux = as_uint(x);
+    uint aux = ux & ~SIGNBIT_SP32;
+    int xneg = ux != aux;
+    int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    float y = as_float(aux);
+
+    // transform if |x| >= 0.5
+    int transform = xexp >= -1;
+
+    float y2 = y * y;
+    float yt = 0.5f * (1.0f - y);
+    float r = transform ? yt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    float a = mad(r,
+                  mad(r,
+                      mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F),
+                      -0.0565298683201845211985026327361F),
+                  0.184161606965100694821398249421F);
+
+    float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F);
+    float u = r * MATH_DIVIDE(a, b);
+
+    float s = MATH_SQRT(r);
+    y = s;
+    float s1 = as_float(as_uint(s) & 0xffff0000);
+    float c = MATH_DIVIDE(mad(s1, -s1, r), s + s1);
+    float rettn = mad(s + mad(y, u, -piby2_tail), -2.0f, pi);
+    float rettp = 2.0F * (s1 + mad(y, u, c));
+    float rett = xneg ? rettn : rettp;
+    float ret = piby2_head - (x - mad(x, -u, piby2_tail));
+
+    ret = transform ? rett : ret;
+    ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret;
+    ret = ux == 0x3f800000U ? 0.0f : ret;
+    ret = ux == 0xbf800000U ? pi : ret;
+    ret = xexp < -26 ? piby2 : ret;
+    return ret;
+}
+
diff --git a/amd-builtins/math32/acoshF.cl b/amd-builtins/math32/acoshF.cl
new file mode 100644
index 0000000..ca8e3c7
--- /dev/null
+++ b/amd-builtins/math32/acoshF.cl
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+acosh(float x)
+{
+    uint ux = as_uint(x);
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by acosh(x) = ln(2) + ln(x)
+    // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is
+    // acosh(x) = ln(x + sqrt(x*x-1))
+    int high = ux > 0x46000000U;
+    int med = ux > 0x40000000U;
+
+    float w = x - 1.0f;
+    float s = w*w + 2.0f*w;
+    float t = x*x - 1.0f;
+    float r = MATH_SQRT(med ? t : s) + (med ? x : w);
+    float v = (high ? x : r) - (med ? 1.0f : 0.0f);
+    float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f);
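+    // For x <= 2 (w = x-1): v = w + sqrt(w*w + 2*w) = x - 1 + sqrt(x*x - 1),
+    // so log1p(v) = ln(x + sqrt(x*x - 1)) without cancellation near x = 1.
+    // For 2 < x <= 2^13: v = x + sqrt(x*x - 1) - 1 and log1p(v) gives the
+    // same logarithm directly.  For x > 2^13: v = x - 1, so log1p(v) = ln(x)
+    // and acosh(x) ~= ln(2) + ln(x); 0x1.62e430p-1f is ln(2) rounded to float.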
+
+    z = ux >= PINFBITPATT_SP32 ? x : z;
+    z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/acospiF.cl b/amd-builtins/math32/acospiF.cl
new file mode 100644
index 0000000..128f84b
--- /dev/null
+++ b/amd-builtins/math32/acospiF.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+acospi(float x)
+{
+    // Computes arccos(x).
+    // The argument is first reduced by noting that arccos(x)
+    // is invalid for abs(x) > 1. For denormal and small
+    // arguments arccos(x) = pi/2 to machine accuracy.
+    // Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arccos(x) = pi/2 - arcsin(x)
+    // = pi/2 - (x + x^3*R(x^2))
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identities:
+    // arccos(x) = 2*arcsin(sqrt((1-x)/2)) for x >= 0,
+    // arccos(x) = pi - 2*arcsin(sqrt((1+x)/2)) for x < 0,
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+
+    // Some constants and split constants.
+    const float pi = 3.1415926535897933e+00f;
+    const float piby2_head = 1.5707963267948965580e+00f;  /* 0x3ff921fb54442d18 */
+    const float piby2_tail = 6.12323399573676603587e-17f; /* 0x3c91a62633145c07 */
+
+    uint ux = as_uint(x);
+    uint aux = ux & ~SIGNBIT_SP32;
+    int xneg = ux != aux;
+    int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+    float y = as_float(aux);
+
+    // transform if |x| >= 0.5
+    int transform = xexp >= -1;
+
+    float y2 = y * y;
+    float yt = 0.5f * (1.0f - y);
+    float r = transform ? yt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    float a = mad(r,
+                  mad(r,
+                      mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F),
+                      -0.0565298683201845211985026327361F),
+                  0.184161606965100694821398249421F);
+    float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F);
+    float u = r * MATH_DIVIDE(a, b);
+
+    float s = MATH_SQRT(r);
+    y = s;
+    float s1 = as_float(as_uint(s) & 0xffff0000);
+    float c = MATH_DIVIDE(r - s1 * s1, s + s1);
+    // float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + (y * u - piby2_tail)), pi);
+    float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + mad(y, u, -piby2_tail)), pi);
+    // float rettp = MATH_DIVIDE(2.0F * s1 + (2.0F * c + 2.0F * y * u), pi);
+    float rettp = MATH_DIVIDE(2.0f*(s1 + mad(y, u, c)), pi);
+    float rett = xneg ? rettn : rettp;
+    // float ret = MATH_DIVIDE(piby2_head - (x - (piby2_tail - x * u)), pi);
+    float ret = MATH_DIVIDE(piby2_head - (x - mad(x, -u, piby2_tail)), pi);
+
+    ret = transform ? rett : ret;
+    ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret;
+    ret = ux == 0x3f800000U ? 0.0f : ret;
+    ret = ux == 0xbf800000U ? 1.0f : ret;
+    ret = xexp < -26 ? 0.5f : ret;
+    return ret;
+}
+
diff --git a/amd-builtins/math32/all_native32.cl b/amd-builtins/math32/all_native32.cl
new file mode 100644
index 0000000..6fd1506
--- /dev/null
+++ b/amd-builtins/math32/all_native32.cl
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// HSAIL versions of native built-ins
+
+// HSAIL intrinsic functions
+extern __attribute__((pure)) float __hsail_ncos_f32(float);
+extern __attribute__((pure)) float __hsail_nexp2_f32(float);
+extern __attribute__((pure)) float __hsail_nlog2_f32(float);
+extern __attribute__((pure)) float __hsail_nrcp_f32(float);
+extern __attribute__((pure)) float __hsail_nrsqrt_f32(float);
+extern __attribute__((pure)) float __hsail_nsin_f32(float);
+extern __attribute__((pure)) float __hsail_nsqrt_f32(float);
+
+// Value of log2(10)
+#define M_LOG2_10_F  3.32192809488736f
+// Value of 1 / log2(10) = log10(2)
+#define M_RLOG2_10_F 0.30102999566398f
+// Value of 1 / M_LOG2E_F = 1 / log2(e) = ln(2)
+#define M_RLOG2_E_F  0.69314718055995f
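+// These constants reduce the exp and log variants to the native exp2 and
+// log2 instructions, e.g. 10^x = 2^(x*log2(10)) and ln(x) = log2(x)*ln(2).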
+
+
+__attribute__((overloadable, always_inline)) float
+native_cos(float x) {
+    return __hsail_ncos_f32(x);
+}
+
+__attribute__((overloadable, always_inline)) float
+native_divide(float x, float y) {
+    return native_recip(y)*x;
+}
+
+__attribute__((overloadable, always_inline)) float
+native_exp2(float x) {
+    return __hsail_nexp2_f32(x);
+}
+
+__attribute__((overloadable,weak,always_inline)) float
+native_exp(float f) {
+  // There is no native exp in HSAIL, but we have an exp2 instruction.
+  return __hsail_nexp2_f32(M_LOG2E_F*f);
+}
+
+__attribute__((overloadable,weak,always_inline)) float
+native_exp10(float f) {
+  // There is no native exp10 in HSAIL, but we have an exp2 instruction.
+  return __hsail_nexp2_f32(M_LOG2_10_F*f);
+}
+
+__attribute__((overloadable, always_inline)) float
+native_log2(float x) {
+    return __hsail_nlog2_f32(x);
+}
+
+__attribute__((overloadable,weak,always_inline)) float
+native_log(float f) {
+  // There is no native log in HSAIL, but we have a log2 instruction.
+  return __hsail_nlog2_f32(f)*M_RLOG2_E_F;
+}
+
+__attribute__((overloadable,weak,always_inline)) float
+native_log10(float f) {
+  // There is no native log10 in HSAIL, but we have a log2 instruction.
+  return __hsail_nlog2_f32(f)*M_RLOG2_10_F;
+}
+
+__attribute__((overloadable, always_inline)) float
+native_powr(float x, float y)
+{
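+    // powr(x, y) = 2^(y*log2(x)), which matches powr's domain of x >= 0.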
+    return native_exp2(native_log2(x)*y);
+}
+
+__attribute__((overloadable, always_inline)) float
+native_recip(float x) {
+    return __hsail_nrcp_f32(x);
+}
+
+__attribute__((overloadable, always_inline)) float
+native_rsqrt(float x)
+{
+    return __hsail_nrsqrt_f32(x);
+}
+
+__attribute__((overloadable, always_inline)) float
+native_sin(float x) {
+    return __hsail_nsin_f32(x);
+}
+
+__attribute__((overloadable, always_inline)) float
+native_sqrt(float x) {
+    return __hsail_nsqrt_f32(x);
+}
+
+extern __attribute__((pure)) float __amdil_tan_f32(float,float);
+__attribute__((overloadable, always_inline)) float
+native_tan(float x)
+{
+    return native_sin(x)*native_recip(native_cos(x));
+}
diff --git a/amd-builtins/math32/asinF.cl b/amd-builtins/math32/asinF.cl
new file mode 100644
index 0000000..58c3f57
--- /dev/null
+++ b/amd-builtins/math32/asinF.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+asin(float x)
+{
+    // Computes arcsin(x).
+    // The argument is first reduced by noting that arcsin(x)
+    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+    // For denormal and small arguments arcsin(x) = x to machine
+    // accuracy. Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arcsin(x) = x + x^3*R(x^2)
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2))
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+
+    const float piby2_tail = 7.5497894159e-08F;   /* 0x33a22168 */
+    const float hpiby2_head = 7.8539812565e-01F;  /* 0x3f490fda */
+    const float piby2 = 1.5707963705e+00F;        /* 0x3fc90fdb */
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ aux;
+    float spiby2 = as_float(xs | as_uint(piby2));
+    int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    float y = as_float(aux);
+
+    // abs(x) >= 0.5
+    int transform = xexp >= -1;
+
+    float y2 = y * y;
+    float rt = 0.5f * (1.0f - y);
+    float r = transform ? rt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    float a = mad(r,
+                  mad(r,
+                      mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F),
+                      -0.0565298683201845211985026327361F),
+                  0.184161606965100694821398249421F);
+
+    float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F);
+    float u = r * MATH_DIVIDE(a, b);
+
+    float s = MATH_SQRT(r);
+    float s1 = as_float(as_uint(s) & 0xffff0000);
+    float c = MATH_DIVIDE(mad(-s1, s1, r), s + s1);
+    float p = mad(2.0f*s, u, -mad(c, -2.0f, piby2_tail));
+    float q = mad(s1, -2.0f, hpiby2_head);
+    float vt = hpiby2_head - (p - q);
+    float v = mad(y, u, y);
+    v = transform ? vt : v;
+
+    float ret = as_float(xs | as_uint(v));
+    ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret;
+    ret = aux == 0x3f800000U ? spiby2 : ret;
+    ret = xexp < -14 ? x : ret;
+
+    return ret;
+}
+
diff --git a/amd-builtins/math32/asinhF.cl b/amd-builtins/math32/asinhF.cl
new file mode 100644
index 0000000..45c5d1f
--- /dev/null
+++ b/amd-builtins/math32/asinhF.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+asinh(float x)
+{
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    uint xsgn = ax ^ ux;
+
+    // |x| <= 2
+    float t = x * x;
+    float a = mad(t,
+                  mad(t,
+                      mad(t,
+                          mad(t, -1.177198915954942694e-4f, -4.162727710583425360e-2f),
+                          -5.063201055468483248e-1f),
+                      -1.480204186473758321f),
+                  -1.152965835871758072f);
+    float b = mad(t,
+                  mad(t,
+                      mad(t,
+                          mad(t, 6.284381367285534560e-2f, 1.260024978680227945f),
+                          6.582362487198468066f),
+                      11.99423176003939087f),
+                  6.917795026025976739f);
+
+    float q = MATH_DIVIDE(a, b);
+    float z1 = mad(x*t, q, x);
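+    // z1 = x + x*t*(a/b), i.e. x + x^3*R(x^2), with R = a/b a rational
+    // approximation to (asinh(x) - x)/x^3 on this range.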
+
+    // |x| > 2
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x
+    // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are
+    // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1))
+    // with the sign of x (see Abramowitz and Stegun 4.6.20)
+
+    float absx = as_float(ax);
+    int hi = ax > 0x46000000U;
+    float y = MATH_SQRT(absx * absx + 1.0f) + absx;
+    y = hi ? absx : y;
+    float r = log(y) + (hi ? 0x1.62e430p-1f : 0.0f);
+    float z2 = as_float(xsgn | as_uint(r));
+
+    float z = ax <= 0x40000000U ? z1 : z2;
+    z = (ax < 0x39800000U) | (ax >= PINFBITPATT_SP32) ? x : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/asinpiF.cl b/amd-builtins/math32/asinpiF.cl
new file mode 100644
index 0000000..009cdf4
--- /dev/null
+++ b/amd-builtins/math32/asinpiF.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+asinpi(float x)
+{
+    // Computes arcsin(x).
+    // The argument is first reduced by noting that arcsin(x)
+    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+    // For denormal and small arguments arcsin(x) = x to machine
+    // accuracy. Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arcsin(x) = x + x^3*R(x^2)
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2))
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+
+    const float pi = 3.1415926535897933e+00f;
+    const float piby2_tail = 7.5497894159e-08F;   /* 0x33a22168 */
+    const float hpiby2_head = 7.8539812565e-01F;  /* 0x3f490fda */
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ aux;
+    float shalf = as_float(xs | as_uint(0.5f));
+
+    int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+    float y = as_float(aux);
+
+    // abs(x) >= 0.5
+    int transform = xexp >= -1;
+
+    float y2 = y * y;
+    float rt = 0.5f * (1.0f - y);
+    float r = transform ? rt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    float a = mad(r,
+                  mad(r,
+                      mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F),
+                      -0.0565298683201845211985026327361F),
+                  0.184161606965100694821398249421F);
+    float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F);
+    float u = r * MATH_DIVIDE(a, b);
+
+    float s = MATH_SQRT(r);
+    float s1 = as_float(as_uint(s) & 0xffff0000);
+    float c = MATH_DIVIDE(mad(-s1, s1, r), s + s1);
+    float p = mad(2.0f*s, u, -mad(c, -2.0f, piby2_tail));
+    float q = mad(s1, -2.0f, hpiby2_head);
+    float vt = hpiby2_head - (p - q);
+    float v = mad(y, u, y);
+    v = transform ? vt : v;
+    v = MATH_DIVIDE(v, pi);
+    float xbypi = MATH_DIVIDE(x, pi);
+
+    float ret = as_float(xs | as_uint(v));
+    ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret;
+    ret = aux == 0x3f800000U ? shalf : ret;
+    ret = xexp < -14 ? xbypi : ret;
+
+    return ret;
+}
+
diff --git a/amd-builtins/math32/atan2F.cl b/amd-builtins/math32/atan2F.cl
new file mode 100644
index 0000000..82b3ac1
--- /dev/null
+++ b/amd-builtins/math32/atan2F.cl
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+#ifndef TABLE_BASED_ATAN2
+__attribute__((overloadable)) float
+atan2(float y, float x)
+{
+    const float pi = 0x1.921fb6p+1f;
+    const float piby2 = 0x1.921fb6p+0f;
+    const float piby4 = 0x1.921fb6p-1f;
+    const float threepiby4 = 0x1.2d97c8p+1f;
+
+    float ax = fabs(x);
+    float ay = fabs(y);
+    float v = min(ax, ay);
+    float u = max(ax, ay);
+
+    // Scale since u could be large, as in "regular" divide
+    float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+    float vbyu = s * MATH_DIVIDE(v, s*u);
+
+    float vbyu2 = vbyu * vbyu;
+
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+    float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+    float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
+
+    // Octant 0 result
+    float a = mad(p, MATH_RECIP(q), vbyu);
+
+    // Fix up 3 other octants
+    float at = piby2 - a;
+    a = ay > ax ? at : a;
+    at = pi - a;
+    a = x < 0.0F ? at : a;
+
+    // y == 0 => 0 for x >= 0, pi for x < 0
+    at = as_int(x) < 0 ? pi : 0.0f;
+    a = y == 0.0f ? at : a;
+
+    // if (!FINITE_ONLY()) {
+        // x and y are +- Inf
+        at = x > 0.0f ? piby4 : threepiby4;
+        a = ax == INFINITY & ay == INFINITY ? at : a;
+
+        // x or y is NaN
+        a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+    // }
+
+    // Fixup sign and return
+    return copysign(a, y);
+}
+#else
+__attribute__((overloadable)) float
+atan2(float y, float x)
+{
+    USE_TABLE(float, p_tbl, M32_ATAN2_JBY256);
+
+    // Explicitly flush arguments
+    x = FTZ(x);
+    y = FTZ(y);
+
+    uint uy = as_uint(y);
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint auy = uy & EXSIGNBIT_SP32;
+
+    // General case: take absolute values of arguments
+    float u = as_float(aux);
+    float v = as_float(auy);
+
+    // Swap u and v if necessary to obtain 0 < v < u
+    int swap_vu = u < v;
+    float uu = u;
+    u = swap_vu ? v : u;
+    v = swap_vu ? uu : v;
+
+    // Use full range division here because the reciprocal of u could be subnormal
+    float vbyu = v / u;
+
+    // Handle large quotient with table and polynomial approximation
+    int big = vbyu > 0.0625f;
+
+    int index = (int) mad(vbyu, 256.0f, 0.5f);
+    float findex = (float)index;
+    float r = MATH_DIVIDE(mad(vbyu, 256.0f, -findex), mad(vbyu, findex, 256.0f));
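+    // r = (vbyu - j/256)/(1 + (j/256)*vbyu) = tan(atan(vbyu) - atan(j/256)),
+    // so atan(vbyu) = atan(j/256) + atan(r), with atan(j/256) from the table.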
+    float s = r * r;
+    index = clamp(index-16, 0, 240);
+    float qbig = mad(r*s, -0.33333333333224095522f, r) + p_tbl[index];
+
+    // Handle small quotient with a series expansion
+    s = vbyu * vbyu;
+    float q = mad(s, -mad(s, -0.14285713561807169030f, 0.19999999999393223405f), 0.33333333333333170500f);
+    q = mad(vbyu*s, -q, vbyu);
+    q = big ? qbig : q;
+
+    // Tidy-up according to which quadrant the arguments lie in
+    const float piby2 = 1.5707963267948966e+00f;
+    float qt = piby2 - q;
+    q = swap_vu ? qt : q;
+
+    int xneg = ux != aux;
+    const float pi = 3.1415926535897932e+00f;
+    qt = pi - q;
+    q = xneg ? qt : q;
+
+    uint ysign = uy ^ auy;
+    q = as_float(ysign | as_uint(q));
+
+    // Now handle a few special cases
+    // Zero y gives +-0 for positive x and +-pi for negative x
+    qt = as_float(ysign | as_uint(pi));
+    qt = xneg ? qt : y;
+    q = y == 0.0f ? qt : q;
+
+    if (!FINITE_ONLY()) {
+        // If abs(x) and abs(y) are both infinity return +-pi/4 or +- 3pi/4 according to signs
+        const float piby4 = 7.8539816339744831e-01f;
+        const float three_piby4 = 2.3561944901923449e+00f;
+        qt = xneg ? three_piby4 : piby4;
+        qt = as_float(ysign | as_uint(qt));
+        q = auy == PINFBITPATT_SP32 & aux == PINFBITPATT_SP32 ? qt : q;
+
+        // If either arg was NaN, return it
+        q = aux > PINFBITPATT_SP32 ? x : q;
+        q = auy > PINFBITPATT_SP32 ? y : q;
+    }
+
+    return q;
+}
+#endif
+
diff --git a/amd-builtins/math32/atan2F_table.h b/amd-builtins/math32/atan2F_table.h
new file mode 100644
index 0000000..e46527a
--- /dev/null
+++ b/amd-builtins/math32/atan2F_table.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/* Array ATAN2_TABLE_JBY256 contains precomputed values of atan(j/256),
+for j = 16, 17, ..., 256. */
+DECLARE_TABLE(float, ATAN2_TABLE_JBY256, 241,
+    6.24188099959573430842e-02f, /* 0x3faff55bb72cfde9 */
+    6.63088949198234745008e-02f, /* 0x3fb0f99ea71d52a6 */
+    7.01969710718705064423e-02f, /* 0x3fb1f86dbf082d58 */
+    7.40829225490337306415e-02f, /* 0x3fb2f719318a4a9a */
+    7.79666338315423007588e-02f, /* 0x3fb3f59f0e7c559d */
+    8.18479898030765457007e-02f, /* 0x3fb4f3fd677292fb */
+    8.57268757707448092464e-02f, /* 0x3fb5f2324fd2d7b2 */
+    8.96031774848717321724e-02f, /* 0x3fb6f03bdcea4b0c */
+    9.34767811585894559112e-02f, /* 0x3fb7ee182602f10e */
+    9.73475734872236708739e-02f, /* 0x3fb8ebc54478fb28 */
+    1.01215441667466668485e-01f, /* 0x3fb9e94153cfdcf1 */
+    1.05080273416329528224e-01f, /* 0x3fbae68a71c722b8 */
+    1.08941956989865793015e-01f, /* 0x3fbbe39ebe6f07c3 */
+    1.12800381201659388752e-01f, /* 0x3fbce07c5c3cca32 */
+    1.16655435441069349478e-01f, /* 0x3fbddd21701eba6e */
+    1.20507009691224548087e-01f, /* 0x3fbed98c2190043a */
+    1.24354994546761424279e-01f, /* 0x3fbfd5ba9aac2f6d */
+    1.28199281231298117811e-01f, /* 0x3fc068d584212b3d */
+    1.32039761614638734288e-01f, /* 0x3fc0e6adccf40881 */
+    1.35876328229701304195e-01f, /* 0x3fc1646541060850 */
+    1.39708874289163620386e-01f, /* 0x3fc1e1fafb043726 */
+    1.43537293701821222491e-01f, /* 0x3fc25f6e171a535c */
+    1.47361481088651630200e-01f, /* 0x3fc2dcbdb2fba1ff */
+    1.51181331798580037562e-01f, /* 0x3fc359e8edeb99a3 */
+    1.54996741923940972718e-01f, /* 0x3fc3d6eee8c6626c */
+    1.58807608315631065832e-01f, /* 0x3fc453cec6092a9e */
+    1.62613828597948567589e-01f, /* 0x3fc4d087a9da4f17 */
+    1.66415301183114927586e-01f, /* 0x3fc54d18ba11570a */
+    1.70211925285474380276e-01f, /* 0x3fc5c9811e3ec269 */
+    1.74003600935367680469e-01f, /* 0x3fc645bfffb3aa73 */
+    1.77790228992676047071e-01f, /* 0x3fc6c1d4898933d8 */
+    1.81571711160032150945e-01f, /* 0x3fc73dbde8a7d201 */
+    1.85347949995694760705e-01f, /* 0x3fc7b97b4bce5b02 */
+    1.89118848926083965578e-01f, /* 0x3fc8350be398ebc7 */
+    1.92884312257974643856e-01f, /* 0x3fc8b06ee2879c28 */
+    1.96644245190344985064e-01f, /* 0x3fc92ba37d050271 */
+    2.00398553825878511514e-01f, /* 0x3fc9a6a8e96c8626 */
+    2.04147145182116990236e-01f, /* 0x3fca217e601081a5 */
+    2.07889927202262986272e-01f, /* 0x3fca9c231b403279 */
+    2.11626808765629753628e-01f, /* 0x3fcb1696574d780b */
+    2.15357699697738047551e-01f, /* 0x3fcb90d7529260a2 */
+    2.19082510780057748701e-01f, /* 0x3fcc0ae54d768466 */
+    2.22801153759394493514e-01f, /* 0x3fcc84bf8a742e6d */
+    2.26513541356919617664e-01f, /* 0x3fccfe654e1d5395 */
+    2.30219587276843717927e-01f, /* 0x3fcd77d5df205736 */
+    2.33919206214733416127e-01f, /* 0x3fcdf110864c9d9d */
+    2.37612313865471241892e-01f, /* 0x3fce6a148e96ec4d */
+    2.41298826930858800743e-01f, /* 0x3fcee2e1451d980c */
+    2.44978663126864143473e-01f, /* 0x3fcf5b75f92c80dd */
+    2.48651741190513253521e-01f, /* 0x3fcfd3d1fc40dbe4 */
+    2.52317980886427151166e-01f, /* 0x3fd025fa510665b5 */
+    2.55977303013005474952e-01f, /* 0x3fd061eea03d6290 */
+    2.59629629408257511791e-01f, /* 0x3fd09dc597d86362 */
+    2.63274882955282396590e-01f, /* 0x3fd0d97ee509acb3 */
+    2.66912987587400396539e-01f, /* 0x3fd1151a362431c9 */
+    2.70543868292936529052e-01f, /* 0x3fd150973a9ce546 */
+    2.74167451119658789338e-01f, /* 0x3fd18bf5a30bf178 */
+    2.77783663178873208022e-01f, /* 0x3fd1c735212dd883 */
+    2.81392432649178403370e-01f, /* 0x3fd2025567e47c95 */
+    2.84993688779881237938e-01f, /* 0x3fd23d562b381041 */
+    2.88587361894077354396e-01f, /* 0x3fd278372057ef45 */
+    2.92173383391398755471e-01f, /* 0x3fd2b2f7fd9b5fe2 */
+    2.95751685750431536626e-01f, /* 0x3fd2ed987a823cfe */
+    2.99322202530807379706e-01f, /* 0x3fd328184fb58951 */
+    3.02884868374971361060e-01f, /* 0x3fd362773707ebcb */
+    3.06439619009630070945e-01f, /* 0x3fd39cb4eb76157b */
+    3.09986391246883430384e-01f, /* 0x3fd3d6d129271134 */
+    3.13525122985043869228e-01f, /* 0x3fd410cbad6c7d32 */
+    3.17055753209146973237e-01f, /* 0x3fd44aa436c2af09 */
+    3.20578221991156986359e-01f, /* 0x3fd4845a84d0c21b */
+    3.24092470489871664618e-01f, /* 0x3fd4bdee586890e6 */
+    3.27598440950530811477e-01f, /* 0x3fd4f75f73869978 */
+    3.31096076704132047386e-01f, /* 0x3fd530ad9951cd49 */
+    3.34585322166458920545e-01f, /* 0x3fd569d88e1b4cd7 */
+    3.38066122836825466713e-01f, /* 0x3fd5a2e0175e0f4e */
+    3.41538425296541714449e-01f, /* 0x3fd5dbc3fbbe768d */
+    3.45002177207105076295e-01f, /* 0x3fd614840309cfe1 */
+    3.48457327308122011278e-01f, /* 0x3fd64d1ff635c1c5 */
+    3.51903825414964732676e-01f, /* 0x3fd685979f5fa6fd */
+    3.55341622416168290144e-01f, /* 0x3fd6bdeac9cbd76c */
+    3.58770670270572189509e-01f, /* 0x3fd6f61941e4def0 */
+    3.62190922004212156882e-01f, /* 0x3fd72e22d53aa2a9 */
+    3.65602331706966821034e-01f, /* 0x3fd7660752817501 */
+    3.69004854528964421068e-01f, /* 0x3fd79dc6899118d1 */
+    3.72398446676754202311e-01f, /* 0x3fd7d5604b63b3f7 */
+    3.75783065409248884237e-01f, /* 0x3fd80cd46a14b1d0 */
+    3.79158669033441808605e-01f, /* 0x3fd84422b8df95d7 */
+    3.82525216899905096124e-01f, /* 0x3fd87b4b0c1ebedb */
+    3.85882669398073752109e-01f, /* 0x3fd8b24d394a1b25 */
+    3.89230987951320717144e-01f, /* 0x3fd8e92916f5cde8 */
+    3.92570135011828580396e-01f, /* 0x3fd91fde7cd0c662 */
+    3.95900074055262896078e-01f, /* 0x3fd9566d43a34907 */
+    3.99220769575252543149e-01f, /* 0x3fd98cd5454d6b18 */
+    4.02532187077682512832e-01f, /* 0x3fd9c3165cc58107 */
+    4.05834293074804064450e-01f, /* 0x3fd9f93066168001 */
+    4.09127055079168300278e-01f, /* 0x3fda2f233e5e530b */
+    4.12410441597387267265e-01f, /* 0x3fda64eec3cc23fc */
+    4.15684422123729413467e-01f, /* 0x3fda9a92d59e98cf */
+    4.18948967133552840902e-01f, /* 0x3fdad00f5422058b */
+    4.22204048076583571270e-01f, /* 0x3fdb056420ae9343 */
+    4.25449637370042266227e-01f, /* 0x3fdb3a911da65c6c */
+    4.28685708391625730496e-01f, /* 0x3fdb6f962e737efb */
+    4.31912235472348193799e-01f, /* 0x3fdba473378624a5 */
+    4.35129193889246812521e-01f, /* 0x3fdbd9281e528191 */
+    4.38336559857957774877e-01f, /* 0x3fdc0db4c94ec9ef */
+    4.41534310525166673322e-01f, /* 0x3fdc42191ff11eb6 */
+    4.44722423960939305942e-01f, /* 0x3fdc76550aad71f8 */
+    4.47900879150937292206e-01f, /* 0x3fdcaa6872f3631b */
+    4.51069655988523443568e-01f, /* 0x3fdcde53432c1350 */
+    4.54228735266762495559e-01f, /* 0x3fdd121566b7f2ad */
+    4.57378098670320809571e-01f, /* 0x3fdd45aec9ec862b */
+    4.60517728767271039558e-01f, /* 0x3fdd791f5a1226f4 */
+    4.63647609000806093515e-01f, /* 0x3fddac670561bb4f */
+    4.66767723680866497560e-01f, /* 0x3fdddf85bb026974 */
+    4.69878057975686880265e-01f, /* 0x3fde127b6b0744af */
+    4.72978597903265574054e-01f, /* 0x3fde4548066cf51a */
+    4.76069330322761219421e-01f, /* 0x3fde77eb7f175a34 */
+    4.79150242925822533735e-01f, /* 0x3fdeaa65c7cf28c4 */
+    4.82221324227853687105e-01f, /* 0x3fdedcb6d43f8434 */
+    4.85282563559221225002e-01f, /* 0x3fdf0ede98f393cf */
+    4.88333951056405479729e-01f, /* 0x3fdf40dd0b541417 */
+    4.91375477653101910835e-01f, /* 0x3fdf72b221a4e495 */
+    4.94407135071275316562e-01f, /* 0x3fdfa45dd3029258 */
+    4.97428915812172245392e-01f, /* 0x3fdfd5e0175fdf83 */
+    5.00440813147294050189e-01f, /* 0x3fe0039c73c1a40b */
+    5.03442821109336358099e-01f, /* 0x3fe01c341e82422d */
+    5.06434934483096732549e-01f, /* 0x3fe034b709250488 */
+    5.09417148796356245022e-01f, /* 0x3fe04d25314342e5 */
+    5.12389460310737621107e-01f, /* 0x3fe0657e94db30cf */
+    5.15351866012543347040e-01f, /* 0x3fe07dc3324e9b38 */
+    5.18304363603577900044e-01f, /* 0x3fe095f30861a58f */
+    5.21246951491958210312e-01f, /* 0x3fe0ae0e1639866c */
+    5.24179628782913242802e-01f, /* 0x3fe0c6145b5b43da */
+    5.27102395269579471204e-01f, /* 0x3fe0de05d7aa6f7c */
+    5.30015251423793132268e-01f, /* 0x3fe0f5e28b67e295 */
+    5.32918198386882147055e-01f, /* 0x3fe10daa77307a0d */
+    5.35811237960463593311e-01f, /* 0x3fe1255d9bfbd2a8 */
+    5.38694372597246617929e-01f, /* 0x3fe13cfbfb1b056e */
+    5.41567605391844897333e-01f, /* 0x3fe1548596376469 */
+    5.44430940071603086672e-01f, /* 0x3fe16bfa6f5137e1 */
+    5.47284380987436924748e-01f, /* 0x3fe1835a88be7c13 */
+    5.50127933104692989907e-01f, /* 0x3fe19aa5e5299f99 */
+    5.52961601994028217888e-01f, /* 0x3fe1b1dc87904284 */
+    5.55785393822313511514e-01f, /* 0x3fe1c8fe7341f64f */
+    5.58599315343562330405e-01f, /* 0x3fe1e00babdefeb3 */
+    5.61403373889889367732e-01f, /* 0x3fe1f7043557138a */
+    5.64197577362497537656e-01f, /* 0x3fe20de813e823b1 */
+    5.66981934222700489912e-01f, /* 0x3fe224b74c1d192a */
+    5.69756453482978431069e-01f, /* 0x3fe23b71e2cc9e6a */
+    5.72521144698072359525e-01f, /* 0x3fe25217dd17e501 */
+    5.75276017956117824426e-01f, /* 0x3fe268a940696da6 */
+    5.78021083869819540801e-01f, /* 0x3fe27f261273d1b3 */
+    5.80756353567670302596e-01f, /* 0x3fe2958e59308e30 */
+    5.83481838685214859730e-01f, /* 0x3fe2abe21aded073 */
+    5.86197551356360535557e-01f, /* 0x3fe2c2215e024465 */
+    5.88903504204738026395e-01f, /* 0x3fe2d84c2961e48b */
+    5.91599710335111383941e-01f, /* 0x3fe2ee628406cbca */
+    5.94286183324841177367e-01f, /* 0x3fe30464753b090a */
+    5.96962937215401501234e-01f, /* 0x3fe31a52048874be */
+    5.99629986503951384336e-01f, /* 0x3fe3302b39b78856 */
+    6.02287346134964152178e-01f, /* 0x3fe345f01cce37bb */
+    6.04935031491913965951e-01f, /* 0x3fe35ba0b60eccce */
+    6.07573058389022313541e-01f, /* 0x3fe3713d0df6c503 */
+    6.10201443063065118722e-01f, /* 0x3fe386c52d3db11e */
+    6.12820202165241245673e-01f, /* 0x3fe39c391cd41719 */
+    6.15429352753104952356e-01f, /* 0x3fe3b198e5e2564a */
+    6.18028912282561737612e-01f, /* 0x3fe3c6e491c78dc4 */
+    6.20618898599929469384e-01f, /* 0x3fe3dc1c2a188504 */
+    6.23199329934065904268e-01f, /* 0x3fe3f13fb89e96f4 */
+    6.25770224888563042498e-01f, /* 0x3fe4064f47569f48 */
+    6.28331602434009650615e-01f, /* 0x3fe41b4ae06fea41 */
+    6.30883481900321840818e-01f, /* 0x3fe430328e4b26d5 */
+    6.33425882969144482537e-01f, /* 0x3fe445065b795b55 */
+    6.35958825666321447834e-01f, /* 0x3fe459c652badc7f */
+    6.38482330354437466191e-01f, /* 0x3fe46e727efe4715 */
+    6.40996417725432032775e-01f, /* 0x3fe4830aeb5f7bfd */
+    6.43501108793284370968e-01f, /* 0x3fe4978fa3269ee1 */
+    6.45996424886771558604e-01f, /* 0x3fe4ac00b1c71762 */
+    6.48482387642300484032e-01f, /* 0x3fe4c05e22de94e4 */
+    6.50959018996812410762e-01f, /* 0x3fe4d4a8023414e8 */
+    6.53426341180761927063e-01f, /* 0x3fe4e8de5bb6ec04 */
+    6.55884376711170835605e-01f, /* 0x3fe4fd013b7dd17e */
+    6.58333148384755983962e-01f, /* 0x3fe51110adc5ed81 */
+    6.60772679271132590273e-01f, /* 0x3fe5250cbef1e9fa */
+    6.63202992706093175102e-01f, /* 0x3fe538f57b89061e */
+    6.65624112284960989250e-01f, /* 0x3fe54ccaf0362c8f */
+    6.68036061856020157990e-01f, /* 0x3fe5608d29c70c34 */
+    6.70438865514021320458e-01f, /* 0x3fe5743c352b33b9 */
+    6.72832547593763097282e-01f, /* 0x3fe587d81f732fba */
+    6.75217132663749830535e-01f, /* 0x3fe59b60f5cfab9d */
+    6.77592645519925151909e-01f, /* 0x3fe5aed6c5909517 */
+    6.79959111179481823228e-01f, /* 0x3fe5c2399c244260 */
+    6.82316554874748071313e-01f, /* 0x3fe5d58987169b18 */
+    6.84665002047148862907e-01f, /* 0x3fe5e8c6941043cf */
+    6.87004478341244895212e-01f, /* 0x3fe5fbf0d0d5cc49 */
+    6.89335009598845749323e-01f, /* 0x3fe60f084b46e05e */
+    6.91656621853199760075e-01f, /* 0x3fe6220d115d7b8d */
+    6.93969341323259825138e-01f, /* 0x3fe634ff312d1f3b */
+    6.96273194408023488045e-01f, /* 0x3fe647deb8e20b8f */
+    6.98568207680949848637e-01f, /* 0x3fe65aabb6c07b02 */
+    7.00854407884450081312e-01f, /* 0x3fe66d663923e086 */
+    7.03131821924453670469e-01f, /* 0x3fe6800e4e7e2857 */
+    7.05400476865049030906e-01f, /* 0x3fe692a40556fb6a */
+    7.07660399923197958039e-01f, /* 0x3fe6a5276c4b0575 */
+    7.09911618463524796141e-01f, /* 0x3fe6b798920b3d98 */
+    7.12154159993178659249e-01f, /* 0x3fe6c9f7855c3198 */
+    7.14388052156768926793e-01f, /* 0x3fe6dc44551553ae */
+    7.16613322731374569052e-01f, /* 0x3fe6ee7f10204aef */
+    7.18829999621624415873e-01f, /* 0x3fe700a7c5784633 */
+    7.21038110854851588272e-01f, /* 0x3fe712be84295198 */
+    7.23237684576317874097e-01f, /* 0x3fe724c35b4fae7b */
+    7.25428749044510712274e-01f, /* 0x3fe736b65a172dff */
+    7.27611332626510676214e-01f, /* 0x3fe748978fba8e0f */
+    7.29785463793429123314e-01f, /* 0x3fe75a670b82d8d8 */
+    7.31951171115916565668e-01f, /* 0x3fe76c24dcc6c6c0 */
+    7.34108483259739652560e-01f, /* 0x3fe77dd112ea22c7 */
+    7.36257428981428097003e-01f, /* 0x3fe78f6bbd5d315e */
+    7.38398037123989547936e-01f, /* 0x3fe7a0f4eb9c19a2 */
+    7.40530336612692630105e-01f, /* 0x3fe7b26cad2e50fd */
+    7.42654356450917929600e-01f, /* 0x3fe7c3d311a6092b */
+    7.44770125716075148681e-01f, /* 0x3fe7d528289fa093 */
+    7.46877673555587429099e-01f, /* 0x3fe7e66c01c114fd */
+    7.48977029182941400620e-01f, /* 0x3fe7f79eacb97898 */
+    7.51068221873802288613e-01f, /* 0x3fe808c03940694a */
+    7.53151280962194302759e-01f, /* 0x3fe819d0b7158a4c */
+    7.55226235836744863583e-01f, /* 0x3fe82ad036000005 */
+    7.57293115936992444759e-01f, /* 0x3fe83bbec5cdee22 */
+    7.59351950749757920178e-01f, /* 0x3fe84c9c7653f7ea */
+    7.61402769805578416573e-01f, /* 0x3fe85d69576cc2c5 */
+    7.63445602675201784315e-01f, /* 0x3fe86e2578f87ae5 */
+    7.65480478966144461950e-01f, /* 0x3fe87ed0eadc5a2a */
+    7.67507428319308182552e-01f, /* 0x3fe88f6bbd023118 */
+    7.69526480405658186434e-01f, /* 0x3fe89ff5ff57f1f7 */
+    7.71537664922959498526e-01f, /* 0x3fe8b06fc1cf3dfe */
+    7.73541011592573490852e-01f, /* 0x3fe8c0d9145cf49d */
+    7.75536550156311621507e-01f, /* 0x3fe8d13206f8c4ca */
+    7.77524310373347682379e-01f, /* 0x3fe8e17aa99cc05d */
+    7.79504322017186335181e-01f, /* 0x3fe8f1b30c44f167 */
+    7.81476614872688268854e-01f, /* 0x3fe901db3eeef187 */
+    7.83441218733151756304e-01f, /* 0x3fe911f35199833b */
+    7.85398163397448278999e-01f, /* 0x3fe921fb54442d18 */
+)
+
diff --git a/amd-builtins/math32/atan2piF.cl b/amd-builtins/math32/atan2piF.cl
new file mode 100644
index 0000000..372e31f
--- /dev/null
+++ b/amd-builtins/math32/atan2piF.cl
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+#ifndef TABLE_BASED_ATAN2
+__attribute__((overloadable)) float
+atan2pi(float y, float x)
+{
+    const float pi = 0x1.921fb6p+1f;
+
+    float ax = fabs(x);
+    float ay = fabs(y);
+    float v = min(ax, ay);
+    float u = max(ax, ay);
+
+    // Scale since u could be large, as in "regular" divide
+    float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+    float vbyu = s * MATH_DIVIDE(v, s*u);
+
+    float vbyu2 = vbyu * vbyu;
+
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+    float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+    float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
+
+    // Octant 0 result
+    float a = MATH_DIVIDE(mad(p, MATH_RECIP(q), vbyu), pi);
+
+    // Fix up 3 other octants
+    float at = 0.5f - a;
+    a = ay > ax ? at : a;
+    at = 1.0f - a;
+    a = x < 0.0F ? at : a;
+
+    // y == 0 => 0 for x >= 0, 1 for x < 0
+    at = as_int(x) < 0 ? 1.0f : 0.0f;
+    a = y == 0.0f ? at : a;
+
+    // if (!FINITE_ONLY()) {
+        // x and y are +- Inf
+        at = x > 0.0f ? 0.25f : 0.75f;
+        a = ax == INFINITY & ay == INFINITY ? at : a;
+
+        // x or y is NaN
+        a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+    // }
+
+    // Fixup sign and return
+    return copysign(a, y);
+}
+#else
+__attribute__((overloadable)) float
+atan2pi(float y, float x)
+{
+    USE_TABLE(float, p_tbl, M32_ATAN2_JBY256);
+
+    // Explicitly flush arguments
+    x = FTZ(x);
+    y = FTZ(y);
+
+    uint uy = as_uint(y);
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint auy = uy & EXSIGNBIT_SP32;
+
+    // General case: take absolute values of arguments
+    float u = as_float(aux);
+    float v = as_float(auy);
+
+    // Swap u and v if necessary to obtain 0 < v < u
+    int swap_vu = u < v;
+    float uu = u;
+    u = swap_vu ? v : u;
+    v = swap_vu ? uu : v;
+
+    // Use full range division here because the reciprocal of u could be subnormal
+    float vbyu = v / u;
+
+    // Handle large quotient with table and polynomial approximation
+    int big = vbyu > 0.0625f;
+
+    int index = (int) mad(vbyu, 256.0f, 0.5f);
+    float findex = (float)index;
+    float r = MATH_DIVIDE(mad(vbyu, 256.0f, -findex), mad(vbyu, findex, 256.0f));
+    float s = r * r;
+    index = clamp(index-16, 0, 240);
+    float qbig = mad(r*s, -0.33333333333224095522f, r) + p_tbl[index];
+
+    // Handle small quotient with a series expansion
+    s = vbyu * vbyu;
+    float q = mad(s, -mad(s, -0.14285713561807169030f, 0.19999999999393223405f), 0.33333333333333170500f);
+    q = mad(vbyu*s, -q, vbyu);
+    q = big ? qbig : q;
+
+    const float pi = 3.1415926535897932e+00f;
+    q = MATH_DIVIDE(q, pi);
+
+    // Tidy-up according to which quadrant the arguments lie in
+    float qt = 0.5f - q;
+    q = swap_vu ? qt : q;
+
+    int xneg = ux != aux;
+    qt = 1.0f - q;
+    q = xneg ? qt : q;
+
+    uint ysign = uy ^ auy;
+    q = as_float(ysign | as_uint(q));
+
+    // Now handle a few special cases
+    // Zero y gives +-0 for positive x and +-1 for negative x
+    qt = as_float(ysign | 0x3f800000);
+    qt = xneg ? qt : y;
+    q = y == 0.0f ? qt : q;
+
+    if (!FINITE_ONLY()) {
+        // If abs(x) and abs(y) are both infinity return +-1/4 or +-3/4 according to signs
+        qt = xneg ? 0.75f : 0.25f;
+        qt = as_float(ysign | as_uint(qt));
+        q = auy == PINFBITPATT_SP32 & aux == PINFBITPATT_SP32 ? qt : q;
+
+        // If either arg was NaN, return it
+        q = aux > PINFBITPATT_SP32 ? x : q;
+        q = auy > PINFBITPATT_SP32 ? y : q;
+    }
+
+    return q;
+}
+#endif
diff --git a/amd-builtins/math32/atanF.cl b/amd-builtins/math32/atanF.cl
new file mode 100644
index 0000000..81bac63
--- /dev/null
+++ b/amd-builtins/math32/atanF.cl
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+atan(float x)
+{
+    const float piby2 = 1.5707963267948966f; // 0x3ff921fb54442d18
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint sx = ux ^ aux;
+
+    float spiby2 = as_float(sx | as_uint(piby2));
+
+    float v = as_float(aux);
+
+    // Return for NaN
+    float ret = x;
+
+    // 2^26 <= |x| <= Inf => atan(x) is close to piby2
+    ret = aux <= PINFBITPATT_SP32 ? spiby2 : ret;
+
+    // Reduce arguments 2^-19 <= |x| < 2^26
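+    // Each band below uses atan(v) = atan(a) + atan((v - a)/(1 + a*v))
+    // with a in {infinity, 1.5, 1.0, 0.5, 0}; for a = infinity this reads
+    // atan(v) = pi/2 - atan(1/v).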
+
+    // 39/16 <= x < 2^26
+    x = -MATH_RECIP(v);
+    float c = 1.57079632679489655800f; // atan(infinity)
+
+    // 19/16 <= x < 39/16
+    int l = aux < 0x401c0000;
+    float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f));
+    x = l ? xx : x;
+    c = l ? 9.82793723247329054082e-01f : c; // atan(1.5)
+
+    // 11/16 <= x < 19/16
+    l = aux < 0x3f980000U;
+    xx = MATH_DIVIDE(v - 1.0f, 1.0f + v);
+    x = l ? xx : x;
+    c = l ? 7.85398163397448278999e-01f : c; // atan(1)
+
+    // 7/16 <= x < 11/16
+    l = aux < 0x3f300000;
+    xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v);
+    x = l ? xx : x;
+    c = l ? 4.63647609000806093515e-01f : c; // atan(0.5)
+
+    // 2^-19 <= x < 7/16
+    l = aux < 0x3ee00000;
+    x = l ? v : x;
+    c = l ? 0.0f : c;
+
+    // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+    float s = x * x;
+    float a = mad(s,
+                  mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+                  0.296528598819239217902158651186f);
+
+    float b = mad(s,
+                  mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+                  0.889585796862432286486651434570f);
+
+    float q = x * s * MATH_DIVIDE(a, b);
+
+    float z = c - (q - x);
+    float zs = as_float(sx | as_uint(z));
+
+    ret = aux < 0x4c800000 ? zs : ret;
+
+    // |x| < 2^-19
+    ret = aux < 0x36000000 ? as_float(ux) : ret;
+    return ret;
+}
+
diff --git a/amd-builtins/math32/atanhF.cl b/amd-builtins/math32/atanhF.cl
new file mode 100644
index 0000000..ca78c58
--- /dev/null
+++ b/amd-builtins/math32/atanhF.cl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+atanh(float x)
+{
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ ax;
+
+    // |x| > 1 or NaN
+    float z = as_float(QNANBITPATT_SP32);
+
+    // |x| == 1
+    float t = as_float(xs | PINFBITPATT_SP32);
+    z = ax == 0x3f800000U ? t : z;
+
+    // 1/2 <= |x| < 1
+    t = as_float(ax);
+    t = MATH_DIVIDE(2.0f*t, 1.0f - t);
+    t = 0.5f * log1p(t);
+    t = as_float(xs | as_uint(t));
+    z = ax < 0x3f800000U ? t : z;
+
+    // |x| < 1/2
+    t = x * x;
+    float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f);
+    float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f);
+    float p = MATH_DIVIDE(a, b);
+    t = mad(x*t, p, x);
+    z = ax < 0x3f000000 ? t : z;
+
+    // |x| < 2^-13
+    z = ax < 0x39000000U ? x : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/atanpiF.cl b/amd-builtins/math32/atanpiF.cl
new file mode 100644
index 0000000..064554f
--- /dev/null
+++ b/amd-builtins/math32/atanpiF.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+atanpi(float x)
+{
+    const float pi = 3.1415926535897932f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint sx = ux ^ aux;
+
+    float xbypi = MATH_DIVIDE(x, pi);
+    float shalf = as_float(sx | as_uint(0.5f));
+
+    float v = as_float(aux);
+
+    // Return for NaN
+    float ret = x;
+
+    // 2^26 <= |x| <= Inf => atanpi(x) is close to +-1/2
+    ret = aux <= PINFBITPATT_SP32 ? shalf : ret;
+
+    // Reduce arguments 2^-19 <= |x| < 2^26
+
+    // 39/16 <= x < 2^26
+    x = -MATH_RECIP(v);
+    float c = 1.57079632679489655800f; // atan(infinity)
+
+    // 19/16 <= x < 39/16
+    int l = aux < 0x401c0000;
+    float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f));
+    x = l ? xx : x;
+    c = l ? 9.82793723247329054082e-01f : c; // atan(1.5)
+
+    // 11/16 <= x < 19/16
+    l = aux < 0x3f980000U;
+    xx = MATH_DIVIDE(v - 1.0f, 1.0f + v);
+    x = l ? xx : x;
+    c = l ? 7.85398163397448278999e-01f : c; // atan(1)
+
+    // 7/16 <= x < 11/16
+    l = aux < 0x3f300000;
+    xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v);
+    x = l ? xx : x;
+    c = l ? 4.63647609000806093515e-01f : c; // atan(0.5)
+
+    // 2^-19 <= x < 7/16
+    l = aux < 0x3ee00000;
+    x = l ? v : x;
+    c = l ? 0.0f : c;
+
+    // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+    float s = x * x;
+    float a = mad(s,
+                  mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+                  0.296528598819239217902158651186f);
+
+    float b = mad(s,
+                  mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+                  0.889585796862432286486651434570f);
+
+    float q = x * s * MATH_DIVIDE(a, b);
+
+    float z = c - (q - x);
+    z = MATH_DIVIDE(z, pi);
+    float zs = as_float(sx | as_uint(z));
+
+    ret = aux < 0x4c800000 ? zs : ret;
+
+    // |x| < 2^-19
+    ret = aux < 0x36000000 ? xbypi : ret;
+    return ret;
+}
+
diff --git a/amd-builtins/math32/cbrtF.cl b/amd-builtins/math32/cbrtF.cl
new file mode 100644
index 0000000..968d504
--- /dev/null
+++ b/amd-builtins/math32/cbrtF.cl
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+// Algorithm:
+// 
+// x = (2^m)*A
+// x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+// x = (2^m)*2*(G/2+g/2)
+// x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+// 
+// Y = (2^(-1))*(2^(-m))*(2^m)*A
+// Now, range of Y is: 0.5 <= Y < 1
+// 
+// F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
+// Now, range of F is: 128 <= F <= 256
+// F = F / 256
+// Now, range of F is: 0.5 <= F <= 1
+// 
+// f = (Y-F), with (f <= 2^(-9))
+// 
+// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F+f)
+// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F) * cbrt(1+(f/F))
+// cbrt(x) = cbrt(2^m) * cbrt(2*F) * cbrt(1+r)
+// 
+// r = (f/F), with (r <= 2^(-8))
+// r = f*(1/F) with (1/F) precomputed to avoid division
+// 
+// cbrt(x) = cbrt(2^m) * cbrt(G) * (1+poly)
+// 
+// poly = c1*r + c2*(r^2) + c3*(r^3) + c4*(r^4) + c5*(r^5) + c6*(r^6)
+
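+// Worked example (a sketch, values rounded): x = 10.0f = 2^3 * 1.25 gives
+// m = 3, hence m3 = m/3 = 1 and rem = 0 (so the 2^(rem/3) correction held
+// in remH/remT below is exactly 1), and
+// cbrt(10) = 2^1 * cbrt(1.25) ~= 2 * 1.077217 ~= 2.154435
+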
+__attribute__((overloadable)) float
+cbrt(float x)
+{
+    USE_TABLE(float2, p_cbrt, CBRT_TBL);
+    USE_TABLE(float, p_log_inv, LOG_INV_TBL);
+
+    uint xi = as_uint(x);
+    uint axi = xi & EXSIGNBIT_SP32;
+    uint xsign = axi ^ xi;
+    xi = axi;
+
+    int m = (xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+    // Treat subnormals
+    uint xisub = as_uint(as_float(xi | 0x3f800000) - 1.0f);
+    int msub = (xisub >> EXPSHIFTBITS_SP32) - 253;
+    int c = m == -127;
+    xi = c ? xisub : xi;
+    m = c ? msub : m;
+
+    int m3 = m / 3;
+    int rem = m - m3*3;
+    float mf = as_float((m3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+
+    uint indx = (xi & 0x007f0000) + ((xi & 0x00008000) << 1);
+    float f = as_float((xi & MANTBITS_SP32) | 0x3f000000) - as_float(indx | 0x3f000000);
+
+    indx >>= 16;
+    float r = f * p_log_inv[indx];
+    float poly = mad(mad(r, 0x1.f9add4p-5f, -0x1.c71c72p-4f), r*r, r * 0x1.555556p-2f);
+
+    // This could also be done with a 5-element table
+    float remH = 0x1.428000p-1f;
+    float remT = 0x1.45f31ap-14f;
+
+    remH = rem == -1 ? 0x1.964000p-1f : remH;
+    remT = rem == -1 ? 0x1.fea53ep-13f : remT;
+
+    remH = rem ==  0 ? 0x1.000000p+0f : remH;
+    remT = rem ==  0 ? 0x0.000000p+0f  : remT;
+
+    remH = rem ==  1 ? 0x1.428000p+0f : remH;
+    remT = rem ==  1 ? 0x1.45f31ap-13f : remT;
+
+    remH = rem ==  2 ? 0x1.964000p+0f : remH;
+    remT = rem ==  2 ? 0x1.fea53ep-12f : remT;
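+
+    // A sketch of the 5-element table alternative mentioned above, using a
+    // hypothetical REM_TBL that would hold the head/tail split of 2^(rem/3)
+    // for rem = -2..2:
+    //   USE_TABLE(float2, p_rem, REM_TBL);
+    //   float2 rv = p_rem[rem + 2];
+    //   remH = rv.s0; remT = rv.s1;
+    // The select ladder above is used instead, presumably to save a lookup.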
+
+    float2 tv = p_cbrt[indx];
+    float cbrtH = tv.s0;
+    float cbrtT = tv.s1;
+
+    float bH = cbrtH * remH;
+    float bT = mad(cbrtH, remT, mad(cbrtT, remH, cbrtT*remT));
+
+    float z = mad(poly, bH, mad(poly, bT, bT)) + bH;
+    z *= mf;
+    z = as_float(as_uint(z) | xsign);
+    c = (axi >= EXPBITS_SP32) | (axi == 0);
+    z = c ? x : z;
+    return z;
+}
+
diff --git a/amd-builtins/math32/cbrtF_table.h b/amd-builtins/math32/cbrtF_table.h
new file mode 100644
index 0000000..209d2b2
--- /dev/null
+++ b/amd-builtins/math32/cbrtF_table.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(float2, CBRT_TBL, 129,
+    (float2)(0x1.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.008000p+0f, 0x1.51cb0ap-11f),
+    (float2)(0x1.014000p+0f, 0x1.39221ep-12f),
+    (float2)(0x1.01c000p+0f, 0x1.e06908p-11f),
+    (float2)(0x1.028000p+0f, 0x1.1d6978p-11f),
+    (float2)(0x1.034000p+0f, 0x1.4ea1bep-13f),
+    (float2)(0x1.03c000p+0f, 0x1.833b8ep-11f),
+    (float2)(0x1.048000p+0f, 0x1.587002p-12f),
+    (float2)(0x1.050000p+0f, 0x1.ceb290p-11f),
+    (float2)(0x1.05c000p+0f, 0x1.d57f34p-12f),
+    (float2)(0x1.068000p+0f, 0x1.cc53acp-21f),
+    (float2)(0x1.070000p+0f, 0x1.0fe098p-11f),
+    (float2)(0x1.07c000p+0f, 0x1.91b586p-15f),
+    (float2)(0x1.084000p+0f, 0x1.1c362ep-11f),
+    (float2)(0x1.090000p+0f, 0x1.94398ep-15f),
+    (float2)(0x1.098000p+0f, 0x1.1055bcp-11f),
+    (float2)(0x1.0a4000p+0f, 0x1.7e63cap-19f),
+    (float2)(0x1.0ac000p+0f, 0x1.d99e1ap-12f),
+    (float2)(0x1.0b4000p+0f, 0x1.d258dep-11f),
+    (float2)(0x1.0c0000p+0f, 0x1.645962p-12f),
+    (float2)(0x1.0c8000p+0f, 0x1.8c5b0ep-11f),
+    (float2)(0x1.0d4000p+0f, 0x1.83d0c8p-13f),
+    (float2)(0x1.0dc000p+0f, 0x1.300812p-11f),
+    (float2)(0x1.0e4000p+0f, 0x1.f9a65ap-11f),
+    (float2)(0x1.0f0000p+0f, 0x1.7bbcd8p-12f),
+    (float2)(0x1.0f8000p+0f, 0x1.7cbf68p-11f),
+    (float2)(0x1.104000p+0f, 0x1.b2c166p-14f),
+    (float2)(0x1.10c000p+0f, 0x1.d56ea4p-12f),
+    (float2)(0x1.114000p+0f, 0x1.99eb32p-11f),
+    (float2)(0x1.120000p+0f, 0x1.1007a2p-13f),
+    (float2)(0x1.128000p+0f, 0x1.d212aap-12f),
+    (float2)(0x1.130000p+0f, 0x1.890f18p-11f),
+    (float2)(0x1.13c000p+0f, 0x1.2104e2p-14f),
+    (float2)(0x1.144000p+0f, 0x1.74961ep-12f),
+    (float2)(0x1.14c000p+0f, 0x1.4b9b66p-11f),
+    (float2)(0x1.154000p+0f, 0x1.d81e66p-11f),
+    (float2)(0x1.160000p+0f, 0x1.7f825cp-13f),
+    (float2)(0x1.168000p+0f, 0x1.c5dca2p-12f),
+    (float2)(0x1.170000p+0f, 0x1.6153bap-11f),
+    (float2)(0x1.178000p+0f, 0x1.db1cc2p-11f),
+    (float2)(0x1.184000p+0f, 0x1.4154b0p-13f),
+    (float2)(0x1.18c000p+0f, 0x1.821114p-12f),
+    (float2)(0x1.194000p+0f, 0x1.2d4240p-11f),
+    (float2)(0x1.19c000p+0f, 0x1.950d82p-11f),
+    (float2)(0x1.1a4000p+0f, 0x1.f8755cp-11f),
+    (float2)(0x1.1b0000p+0f, 0x1.5e12a4p-13f),
+    (float2)(0x1.1b8000p+0f, 0x1.648c38p-12f),
+    (float2)(0x1.1c0000p+0f, 0x1.08c43ep-11f),
+    (float2)(0x1.1c8000p+0f, 0x1.5b0970p-11f),
+    (float2)(0x1.1d0000p+0f, 0x1.a91fe8p-11f),
+    (float2)(0x1.1d8000p+0f, 0x1.f311b6p-11f),
+    (float2)(0x1.1e4000p+0f, 0x1.c74618p-14f),
+    (float2)(0x1.1ec000p+0f, 0x1.eabb54p-13f),
+    (float2)(0x1.1f4000p+0f, 0x1.70db14p-12f),
+    (float2)(0x1.1fc000p+0f, 0x1.e45cbcp-12f),
+    (float2)(0x1.204000p+0f, 0x1.27faa6p-11f),
+    (float2)(0x1.20c000p+0f, 0x1.59db98p-11f),
+    (float2)(0x1.214000p+0f, 0x1.87da46p-11f),
+    (float2)(0x1.21c000p+0f, 0x1.b1ffa0p-11f),
+    (float2)(0x1.224000p+0f, 0x1.d85478p-11f),
+    (float2)(0x1.22c000p+0f, 0x1.fae17ep-11f),
+    (float2)(0x1.238000p+0f, 0x1.9af40cp-15f),
+    (float2)(0x1.240000p+0f, 0x1.a6319ep-14f),
+    (float2)(0x1.248000p+0f, 0x1.30baa6p-13f),
+    (float2)(0x1.250000p+0f, 0x1.7fc362p-13f),
+    (float2)(0x1.258000p+0f, 0x1.c05362p-13f),
+    (float2)(0x1.260000p+0f, 0x1.f28a98p-13f),
+    (float2)(0x1.268000p+0f, 0x1.0b4442p-12f),
+    (float2)(0x1.270000p+0f, 0x1.16361ap-12f),
+    (float2)(0x1.278000p+0f, 0x1.1a2a2ap-12f),
+    (float2)(0x1.280000p+0f, 0x1.172f8ep-12f),
+    (float2)(0x1.288000p+0f, 0x1.0d5530p-12f),
+    (float2)(0x1.290000p+0f, 0x1.f9538ep-13f),
+    (float2)(0x1.298000p+0f, 0x1.ca77b0p-13f),
+    (float2)(0x1.2a0000p+0f, 0x1.8e336ap-13f),
+    (float2)(0x1.2a8000p+0f, 0x1.44a304p-13f),
+    (float2)(0x1.2b0000p+0f, 0x1.dbc4c8p-14f),
+    (float2)(0x1.2b8000p+0f, 0x1.141a2ap-14f),
+    (float2)(0x1.2c0000p+0f, 0x1.93e44cp-17f),
+    (float2)(0x1.2c4000p+0f, 0x1.e6e432p-11f),
+    (float2)(0x1.2cc000p+0f, 0x1.c447c6p-11f),
+    (float2)(0x1.2d4000p+0f, 0x1.9e80d8p-11f),
+    (float2)(0x1.2dc000p+0f, 0x1.7595dcp-11f),
+    (float2)(0x1.2e4000p+0f, 0x1.498d30p-11f),
+    (float2)(0x1.2ec000p+0f, 0x1.1a6d1ep-11f),
+    (float2)(0x1.2f4000p+0f, 0x1.d077bap-12f),
+    (float2)(0x1.2fc000p+0f, 0x1.65ff1ep-12f),
+    (float2)(0x1.304000p+0f, 0x1.eaf912p-13f),
+    (float2)(0x1.30c000p+0f, 0x1.fbefb8p-14f),
+    (float2)(0x1.314000p+0f, 0x1.44905ap-19f),
+    (float2)(0x1.318000p+0f, 0x1.c017e6p-11f),
+    (float2)(0x1.320000p+0f, 0x1.7bfdbep-11f),
+    (float2)(0x1.328000p+0f, 0x1.34fbc6p-11f),
+    (float2)(0x1.330000p+0f, 0x1.d62f48p-12f),
+    (float2)(0x1.338000p+0f, 0x1.3cadc6p-12f),
+    (float2)(0x1.340000p+0f, 0x1.3afc06p-13f),
+    (float2)(0x1.344000p+0f, 0x1.fc556ep-11f),
+    (float2)(0x1.34c000p+0f, 0x1.a71f84p-11f),
+    (float2)(0x1.354000p+0f, 0x1.4f2290p-11f),
+    (float2)(0x1.35c000p+0f, 0x1.e8c79cp-12f),
+    (float2)(0x1.364000p+0f, 0x1.2dd0d8p-12f),
+    (float2)(0x1.36c000p+0f, 0x1.b5ac2ep-14f),
+    (float2)(0x1.370000p+0f, 0x1.d3d02ap-11f),
+    (float2)(0x1.378000p+0f, 0x1.6e3d58p-11f),
+    (float2)(0x1.380000p+0f, 0x1.060200p-11f),
+    (float2)(0x1.388000p+0f, 0x1.364608p-12f),
+    (float2)(0x1.390000p+0f, 0x1.6d29b6p-14f),
+    (float2)(0x1.394000p+0f, 0x1.bd8d5ep-11f),
+    (float2)(0x1.39c000p+0f, 0x1.4ae030p-11f),
+    (float2)(0x1.3a4000p+0f, 0x1.ab44b2p-12f),
+    (float2)(0x1.3ac000p+0f, 0x1.7761cep-13f),
+    (float2)(0x1.3b0000p+0f, 0x1.e38710p-11f),
+    (float2)(0x1.3b8000p+0f, 0x1.66b2b0p-11f),
+    (float2)(0x1.3c0000p+0f, 0x1.cebf96p-12f),
+    (float2)(0x1.3c8000p+0f, 0x1.964b20p-13f),
+    (float2)(0x1.3cc000p+0f, 0x1.e15004p-11f),
+    (float2)(0x1.3d4000p+0f, 0x1.5a9bcep-11f),
+    (float2)(0x1.3dc000p+0f, 0x1.a2f4d8p-12f),
+    (float2)(0x1.3e4000p+0f, 0x1.17c056p-13f),
+    (float2)(0x1.3e8000p+0f, 0x1.b800f8p-11f),
+    (float2)(0x1.3f0000p+0f, 0x1.27b132p-11f),
+    (float2)(0x1.3f8000p+0f, 0x1.2a09b8p-12f),
+    (float2)(0x1.400000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.404000p+0f, 0x1.68a69cp-11f),
+    (float2)(0x1.40c000p+0f, 0x1.9df950p-12f),
+    (float2)(0x1.414000p+0f, 0x1.983050p-14f),
+    (float2)(0x1.418000p+0f, 0x1.94c6a4p-11f),
+    (float2)(0x1.420000p+0f, 0x1.e88494p-12f),
+    (float2)(0x1.428000p+0f, 0x1.45f31ap-13f),
+)
+
diff --git a/amd-builtins/math32/ceilF.cl b/amd-builtins/math32/ceilF.cl
new file mode 100644
index 0000000..137ebe1
--- /dev/null
+++ b/amd-builtins/math32/ceilF.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+ceil(float x)
+{
+    return __amdil_round_posinf_f32(x);
+}
diff --git a/amd-builtins/math32/copysignF.cl b/amd-builtins/math32/copysignF.cl
new file mode 100644
index 0000000..c60cbaf
--- /dev/null
+++ b/amd-builtins/math32/copysignF.cl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+// __hsail_ intrinsic which has no __amdil_ equivalent.
+extern __attribute__((pure)) float  __hsail_copysign_f32(float, float);
+
+__attribute__((overloadable, always_inline)) float
+copysign(float x, float y)
+{
+    return __hsail_copysign_f32(x, y);
+}
+
diff --git a/amd-builtins/math32/cosF.cl b/amd-builtins/math32/cosF.cl
new file mode 100644
index 0000000..f5431f0
--- /dev/null
+++ b/amd-builtins/math32/cosF.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if 1
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "sincosF_piby4.h"
+//#else
+//extern __attribute__((pure)) float __amdil_cos_f32(float);
+#endif
+
+__attribute__((overloadable, pure)) float
+cos(float x)
+{
+#if 1
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    float dx = as_float(ax);
+
+    float r0, r1;
+    int regn = argReductionS(&r0, &r1, dx);
+
+    float ss = -sinf_piby4_new(r0, r1);
+    float cc =  cosf_piby4_new(r0, r1);
+
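+    // Quadrant selection: with the argument reduced to r0 + r1 in
+    // [-pi/4, pi/4] and regn the quadrant mod 4, cos maps onto
+    // {cos, -sin, -cos, sin} for regn = 0..3: the low bit of regn picks
+    // the component and regn > 1 flips the sign.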
+    float c =  (regn & 1) != 0 ? ss : cc;
+    c = as_float(as_int(c) ^ ((regn > 1) << 31));
+
+    c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c;
+
+    return c;
+#else
+  // TODO_HSA: Using native_cos for now.
+  return native_cos(x);
+#endif
+}
+
diff --git a/amd-builtins/math32/coshF.cl b/amd-builtins/math32/coshF.cl
new file mode 100644
index 0000000..244bae1
--- /dev/null
+++ b/amd-builtins/math32/coshF.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+cosh(float x)
+{
+    // After dealing with special cases the computation is split into regions as follows.
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = +Inf (cosh is even, so the sign of x is irrelevant)
+    // abs(x) >= small_threshold:
+    // cosh(x) = exp(abs(x))/2, since exp(-abs(x)) is negligible there
+    // abs(x) < small_threshold:
+    // cosh(x) is computed from tabulated values of sinh and cosh at integer
+    // points combined with Taylor expansions of the fractional part, see below
+
+    // Tabulated values of sinh(i) and cosh(i) for i = 0,...,36.
+    USE_TABLE(float2, p_tbl, SINHCOSH_TBL);
+
+    const float max_cosh_arg = 0x1.65a9fap+6f;
+    const float small_threshold = 0x1.0a2b24p+3f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    float y = as_float(aux);
+
+    // Find the integer part y0 of y and the increment dy = y - y0. We then compute
+    // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are tabulated above.
+
+    int ind = (int)y;
+    ind = (uint)ind > 36U ? 0 : ind;
+
+    float dy = y - ind;
+    float dy2 = dy * dy;
+
+    float sdy = mad(dy2,
+                    mad(dy2,
+                        mad(dy2,
+                            mad(dy2,
+                                mad(dy2,
+                                    mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f),
+                                    0.250521176994133472333666e-7f),
+                                0.275573191913636406057211e-5f),
+                            0.198412698413242405162014e-3f),
+                        0.833333333333329931873097e-2f),
+                    0.166666666666666667013899e0f);
+    sdy = mad(sdy, dy*dy2, dy);
+
+    float cdy = mad(dy2,
+                    mad(dy2,
+                        mad(dy2,
+                            mad(dy2,
+                                mad(dy2,
+                                    mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f),
+                                    0.275573350756016588011357e-6f),
+                                0.248015872460622433115785e-4f),
+                            0.138888888889814854814536e-2f),
+                        0.416666666666660876512776e-1f),
+                    0.500000000000000005911074e0f);
+    cdy = mad(cdy, dy2, 1.0f);
+
+    float2 tv = p_tbl[ind];
+    float z = mad(tv.s0, sdy, tv.s1 * cdy);
+
+    // When exp(-x) is insignificant compared to exp(x), return exp(x)/2
+    float t = exp(y - 0x1.62e500p-1f);
+    float zsmall = mad(0x1.a0210ep-18f, t, t);
+    z = y >= small_threshold ? zsmall : z;
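+    // (exp(y)/2 = exp(y - ln2), with ln2 applied as the head constant
+    // 0x1.62e500p-1f inside exp plus the tiny multiplicative correction
+    // 0x1.a0210ep-18f; this also avoids overflowing exp(y) for y just
+    // below max_cosh_arg.)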
+
+    // Corner cases
+    z = y >= max_cosh_arg ? as_float(PINFBITPATT_SP32) : z;
+    z = aux > PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : z;
+    z = aux < 0x38800000 ? 1.0f : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/cospiF.cl b/amd-builtins/math32/cospiF.cl
new file mode 100644
index 0000000..2ed79ab
--- /dev/null
+++ b/amd-builtins/math32/cospiF.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "sincospiF_piby4.h"
+
+__attribute__((overloadable)) float
+cospi(float x)
+{
+    const float pi = 3.1415926535897932F;
+
+    int ix = as_int(x) & 0x7fffffff; 
+    float ax = as_float(ix);
+    int iax = (int)ax;
+    float r = ax - iax;
+    int xodd = iax & 0x1 ? 0x80000000 : 0;
+
+    // Initialize with return for +-Inf and NaN
+    int ir = 0x7fc00000;
+
+    // 2^24 <= |x| < Inf, the result is always an even integer
+    ir = ix < 0x7f800000 ? 0x3f800000 : ir;
+
+    // 2^23 <= |x| < 2^24, the result is always an integer
+    ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir;
+
+    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
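+    // With ax = iax + r, cos(pi*ax) = (-1)^iax * cos(pi*r), and cos(pi*r)
+    // on [0,1) folds onto sincosf_piby4 (assumed here to return (sin, cos)
+    // in (.lo, .hi)) as:
+    //   r in [0,    0.25] : +cos(pi*r)       -> e = 1, a = r
+    //   r in (0.25, 0.5 ) : +sin(pi*(0.5-r)) -> e = 0, a = 0.5 - r
+    //   r in [0.5,  0.75] : -sin(pi*(r-0.5)) -> e = 0, a = r - 0.5
+    //   r in (0.75, 1.0 ) : -cos(pi*(1-r))   -> e = 1, a = 1 - r
+    // with the (-1)^iax factor folded into the sign word s via xodd.
+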
+    // r < 1.0
+    float a = 1.0f - r;
+    int e = 1;
+    int s = xodd ^ 0x80000000;
+
+    // r <= 0.75
+    int c = r <= 0.75f;
+    a = c ? r - 0.5f : a;
+    e = c ? 0 : e;
+
+    // r < 0.5
+    c = r < 0.5f;
+    a = c ? 0.5f - r : a;
+    s = c ? xodd : s;
+
+    // r <= 0.25
+    c = r <= 0.25f;
+    a = c ? r : a;
+    e = c ? 1 : e;
+
+    float2 t = sincosf_piby4(a * pi);
+    int jr = s ^ as_int(e ? t.hi : t.lo);
+
+    ir = ix < 0x4b000000 ? jr : ir;
+
+    return as_float(ir);
+}
+
diff --git a/amd-builtins/math32/erfF.cl b/amd-builtins/math32/erfF.cl
new file mode 100644
index 0000000..94e372b
--- /dev/null
+++ b/amd-builtins/math32/erfF.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#if !defined(SUBNORMALS_SUPPORTED)
+#include "floattointconversion.h"
+#endif //SUBNORMALS_SUPPORTED
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+*/
+
+#define erx   8.4506291151e-01f        /* 0x3f58560b */
+
+// Coefficients for approximation to erf on [0, 0.84375]
+
+#define efx   1.2837916613e-01f        /* 0x3e0375d4 */
+#define efx8  1.0270333290e+00f        /* 0x3f8375d4 */
+
+#define pp0   1.2837916613e-01f        /* 0x3e0375d4 */
+#define pp1  -3.2504209876e-01f        /* 0xbea66beb */
+#define pp2  -2.8481749818e-02f        /* 0xbce9528f */
+#define pp3  -5.7702702470e-03f        /* 0xbbbd1489 */
+#define pp4  -2.3763017452e-05f        /* 0xb7c756b1 */
+#define qq1   3.9791721106e-01f        /* 0x3ecbbbce */
+#define qq2   6.5022252500e-02f        /* 0x3d852a63 */
+#define qq3   5.0813062117e-03f        /* 0x3ba68116 */
+#define qq4   1.3249473704e-04f        /* 0x390aee49 */
+#define qq5  -3.9602282413e-06f        /* 0xb684e21a */
+
+// Coefficients for approximation to erf in [0.84375, 1.25]
+
+#define pa0  -2.3621185683e-03f        /* 0xbb1acdc6 */
+#define pa1   4.1485610604e-01f        /* 0x3ed46805 */
+#define pa2  -3.7220788002e-01f        /* 0xbebe9208 */
+#define pa3   3.1834661961e-01f        /* 0x3ea2fe54 */
+#define pa4  -1.1089469492e-01f        /* 0xbde31cc2 */
+#define pa5   3.5478305072e-02f        /* 0x3d1151b3 */
+#define pa6  -2.1663755178e-03f        /* 0xbb0df9c0 */
+#define qa1   1.0642088205e-01f        /* 0x3dd9f331 */
+#define qa2   5.4039794207e-01f        /* 0x3f0a5785 */
+#define qa3   7.1828655899e-02f        /* 0x3d931ae7 */
+#define qa4   1.2617121637e-01f        /* 0x3e013307 */
+#define qa5   1.3637083583e-02f        /* 0x3c5f6e13 */
+#define qa6   1.1984500103e-02f        /* 0x3c445aa3 */
+
+// Coefficients for approximation to erfc in [1.25, 1/0.35]
+
+#define ra0  -9.8649440333e-03f        /* 0xbc21a093 */
+#define ra1  -6.9385856390e-01f        /* 0xbf31a0b7 */
+#define ra2  -1.0558626175e+01f        /* 0xc128f022 */
+#define ra3  -6.2375331879e+01f        /* 0xc2798057 */
+#define ra4  -1.6239666748e+02f        /* 0xc322658c */
+#define ra5  -1.8460508728e+02f        /* 0xc3389ae7 */
+#define ra6  -8.1287437439e+01f        /* 0xc2a2932b */
+#define ra7  -9.8143291473e+00f        /* 0xc11d077e */
+#define sa1   1.9651271820e+01f        /* 0x419d35ce */
+#define sa2   1.3765776062e+02f        /* 0x4309a863 */
+#define sa3   4.3456588745e+02f        /* 0x43d9486f */
+#define sa4   6.4538726807e+02f        /* 0x442158c9 */
+#define sa5   4.2900814819e+02f        /* 0x43d6810b */
+#define sa6   1.0863500214e+02f        /* 0x42d9451f */
+#define sa7   6.5702495575e+00f        /* 0x40d23f7c */
+#define sa8  -6.0424413532e-02f        /* 0xbd777f97 */
+
+// Coefficients for approximation to erfc in [1/0.35, 28]
+
+#define rb0  -9.8649431020e-03f        /* 0xbc21a092 */
+#define rb1  -7.9928326607e-01f        /* 0xbf4c9dd4 */
+#define rb2  -1.7757955551e+01f        /* 0xc18e104b */
+#define rb3  -1.6063638306e+02f        /* 0xc320a2ea */
+#define rb4  -6.3756646729e+02f        /* 0xc41f6441 */
+#define rb5  -1.0250950928e+03f        /* 0xc480230b */
+#define rb6  -4.8351919556e+02f        /* 0xc3f1c275 */
+#define sb1   3.0338060379e+01f        /* 0x41f2b459 */
+#define sb2   3.2579251099e+02f        /* 0x43a2e571 */
+#define sb3   1.5367296143e+03f        /* 0x44c01759 */
+#define sb4   3.1998581543e+03f        /* 0x4547fdbb */
+#define sb5   2.5530502930e+03f        /* 0x451f90ce */
+#define sb6   4.7452853394e+02f        /* 0x43ed43a7 */
+#define sb7  -2.2440952301e+01f        /* 0xc1b38712 */
+
+__attribute__((overloadable)) float
+erf(float x)
+{
+
+    int hx = as_int(x);
+    int ix = hx & 0x7fffffff;
+    float absx = as_float(ix);
+
+    float x2 = absx * absx;
+    float t = 1.0f / x2;
+    float tt = absx - 1.0f;
+    t = absx < 1.25f ? tt : t;
+    t = absx < 0.84375f ? x2 : t;
+
+    float u, v, tu, tv;
+
+    // Defaults: erfc coefficients for the outermost region, |x| >= 1/0.35
+    u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+    v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1);
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1);
+    u = absx < 0x1.6db6dcp+1f ? tu : u;
+    v = absx < 0x1.6db6dcp+1f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1);
+    u = absx < 1.25f ? tu : u;
+    v = absx < 1.25f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0);
+    tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1);
+    u = absx < 0.84375f ? tu : u;
+    v = absx < 0.84375f ? tv : v;
+
+    v = mad(t, v, 1.0f);
+    float q = MATH_DIVIDE(u, v);
+
+    float ret = 1.0f;
+
+    // |x| < 6
+    float z = as_float(ix & 0xfffff000);
+    float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z-absx, z+absx, q));
+    r = 1.0f - MATH_DIVIDE(r, absx);
+    ret = absx < 6.0f ? r : ret;
+
+    r = erx + q;
+    ret = absx < 1.25f ? r : ret;
+
+    ret = as_float((hx & 0x80000000) | as_int(ret));
+
+    r = mad(x, q, x);
+    ret = absx < 0.84375f ? r : ret;
+
+    // Prevent underflow
+    r = 0.125f * mad(8.0f, x, efx8 * x);
+    ret = absx < 0x1.0p-28f ? r : ret;
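+    // (For such tiny x, erf(x) ~= (2/sqrt(pi))*x = x + efx*x; forming it as
+    // 0.125*(8*x + efx8*x) keeps the intermediates away from the underflow
+    // threshold before the final scale-down.)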
+
+#if !defined(SUBNORMALS_SUPPORTED)
+    // Subnormal/zero inputs: erf(x) ~= (2/sqrt(pi))*x, done in double via bit casts.
+    double dx = float_uint_to_double(hx);
+    const double sqt4overpi = 1.1283791670955125738961589031215;
+    float ret1 = as_float(double_to_float_uint(sqt4overpi * dx));
+    int c = as_uint(absx) == 0;
+    float ret2 = hx == 0 ? 0.0f : -0.0f;
+    ret1 = c ? ret2 : ret1;
+    ret = x == 0. ? ret1 : ret;
+#endif //SUBNORMALS_SUPPORTED
+
+    ret = isnan(x) ? x : ret;
+
+    return ret;
+}
+
diff --git a/amd-builtins/math32/erfcF.cl b/amd-builtins/math32/erfcF.cl
new file mode 100644
index 0000000..3081785
--- /dev/null
+++ b/amd-builtins/math32/erfcF.cl
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+*/
+
+#define erx   8.4506291151e-01f        /* 0x3f58560b */
+
+// Coefficients for approximation to erf on [0, 0.84375]
+
+#define efx   1.2837916613e-01f        /* 0x3e0375d4 */
+#define efx8  1.0270333290e+00f        /* 0x3f8375d4 */
+
+#define pp0   1.2837916613e-01f        /* 0x3e0375d4 */
+#define pp1  -3.2504209876e-01f        /* 0xbea66beb */
+#define pp2  -2.8481749818e-02f        /* 0xbce9528f */
+#define pp3  -5.7702702470e-03f        /* 0xbbbd1489 */
+#define pp4  -2.3763017452e-05f        /* 0xb7c756b1 */
+#define qq1   3.9791721106e-01f        /* 0x3ecbbbce */
+#define qq2   6.5022252500e-02f        /* 0x3d852a63 */
+#define qq3   5.0813062117e-03f        /* 0x3ba68116 */
+#define qq4   1.3249473704e-04f        /* 0x390aee49 */
+#define qq5  -3.9602282413e-06f        /* 0xb684e21a */
+
+// Coefficients for approximation to erf in [0.84375, 1.25]
+
+#define pa0  -2.3621185683e-03f        /* 0xbb1acdc6 */
+#define pa1   4.1485610604e-01f        /* 0x3ed46805 */
+#define pa2  -3.7220788002e-01f        /* 0xbebe9208 */
+#define pa3   3.1834661961e-01f        /* 0x3ea2fe54 */
+#define pa4  -1.1089469492e-01f        /* 0xbde31cc2 */
+#define pa5   3.5478305072e-02f        /* 0x3d1151b3 */
+#define pa6  -2.1663755178e-03f        /* 0xbb0df9c0 */
+#define qa1   1.0642088205e-01f        /* 0x3dd9f331 */
+#define qa2   5.4039794207e-01f        /* 0x3f0a5785 */
+#define qa3   7.1828655899e-02f        /* 0x3d931ae7 */
+#define qa4   1.2617121637e-01f        /* 0x3e013307 */
+#define qa5   1.3637083583e-02f        /* 0x3c5f6e13 */
+#define qa6   1.1984500103e-02f        /* 0x3c445aa3 */
+
+// Coefficients for approximation to erfc in [1.25, 1/0.35]
+
+#define ra0  -9.8649440333e-03f        /* 0xbc21a093 */
+#define ra1  -6.9385856390e-01f        /* 0xbf31a0b7 */
+#define ra2  -1.0558626175e+01f        /* 0xc128f022 */
+#define ra3  -6.2375331879e+01f        /* 0xc2798057 */
+#define ra4  -1.6239666748e+02f        /* 0xc322658c */
+#define ra5  -1.8460508728e+02f        /* 0xc3389ae7 */
+#define ra6  -8.1287437439e+01f        /* 0xc2a2932b */
+#define ra7  -9.8143291473e+00f        /* 0xc11d077e */
+#define sa1   1.9651271820e+01f        /* 0x419d35ce */
+#define sa2   1.3765776062e+02f        /* 0x4309a863 */
+#define sa3   4.3456588745e+02f        /* 0x43d9486f */
+#define sa4   6.4538726807e+02f        /* 0x442158c9 */
+#define sa5   4.2900814819e+02f        /* 0x43d6810b */
+#define sa6   1.0863500214e+02f        /* 0x42d9451f */
+#define sa7   6.5702495575e+00f        /* 0x40d23f7c */
+#define sa8  -6.0424413532e-02f        /* 0xbd777f97 */
+
+// Coefficients for approximation to erfc in [1/0.35, 28]
+
+#define rb0  -9.8649431020e-03f        /* 0xbc21a092 */
+#define rb1  -7.9928326607e-01f        /* 0xbf4c9dd4 */
+#define rb2  -1.7757955551e+01f        /* 0xc18e104b */
+#define rb3  -1.6063638306e+02f        /* 0xc320a2ea */
+#define rb4  -6.3756646729e+02f        /* 0xc41f6441 */
+#define rb5  -1.0250950928e+03f        /* 0xc480230b */
+#define rb6  -4.8351919556e+02f        /* 0xc3f1c275 */
+#define sb1   3.0338060379e+01f        /* 0x41f2b459 */
+#define sb2   3.2579251099e+02f        /* 0x43a2e571 */
+#define sb3   1.5367296143e+03f        /* 0x44c01759 */
+#define sb4   3.1998581543e+03f        /* 0x4547fdbb */
+#define sb5   2.5530502930e+03f        /* 0x451f90ce */
+#define sb6   4.7452853394e+02f        /* 0x43ed43a7 */
+#define sb7  -2.2440952301e+01f        /* 0xc1b38712 */
+
+__attribute__((overloadable)) float
+erfc(float x)
+{
+    int hx = as_int(x);
+    int ix = hx & 0x7fffffff;
+    float absx = as_float(ix);
+
+    // Argument for polys
+    float x2 = absx * absx;
+    float t = 1.0f / x2;
+    float tt = absx - 1.0f;
+    t = absx < 1.25f ? tt : t;
+    t = absx < 0.84375f ? x2 : t;
+
+    // Evaluate polys
+    float tu, tv, u, v;
+
+    u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+    v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1);
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1);
+    u = absx < 0x1.6db6dap+1f ? tu : u;
+    v = absx < 0x1.6db6dap+1f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1);
+    u = absx < 1.25f ? tu : u;
+    v = absx < 1.25f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0);
+    tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1);
+    u = absx < 0.84375f ? tu : u;
+    v = absx < 0.84375f ? tv : v;
+
+    v = mad(t, v, 1.0f);
+
+    float q = MATH_DIVIDE(u, v);
+
+    float ret = 0.0f;
+
+    float z = as_float(ix & 0xfffff000);
+    float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z - absx, z + absx, q));
+    r = MATH_DIVIDE(r, absx);
+    t = 2.0f - r;
+    r = x < 0.0f ? t : r;
+    ret = absx < 28.0f ? r : ret;
+
+    r = 1.0f - erx - q;
+    t = erx + q + 1.0f;
+    r = x < 0.0f ? t : r;
+    ret = absx < 1.25f ? r : ret;
+
+    r = 0.5f - mad(x, q, x - 0.5f);
+    ret = absx < 0.84375f ? r : ret;
+
+    ret = x < -6.0f ? 2.0f : ret;
+
+    ret = isnan(x) ? x : ret;
+
+    return ret;
+}
+
diff --git a/amd-builtins/math32/exp10F.cl b/amd-builtins/math32/exp10F.cl
new file mode 100644
index 0000000..3541a68
--- /dev/null
+++ b/amd-builtins/math32/exp10F.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_EXP10
+#include "expF_base.h"
+
diff --git a/amd-builtins/math32/exp2F.cl b/amd-builtins/math32/exp2F.cl
new file mode 100644
index 0000000..5086eb7
--- /dev/null
+++ b/amd-builtins/math32/exp2F.cl
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if 0
+
+#define COMPILING_EXP2
+#include "expF_base.h"
+
+#else
+
+#include "math32.h"
+
+__attribute__((overloadable, weak)) float
+exp2(float x)
+{
+    // Reduce x
+    const float ln2HI = 0x1.62e300p-1f;
+    const float ln2LO = 0x1.2fefa2p-17f;
+
+    float t = rint(x);
+    int p = (int)t;
+    float tt = x - t;
+    float hi = tt * ln2HI;
+    float lo = tt * ln2LO;
+
+    // Evaluate poly
+    t = hi + lo;
+    tt = t*t;
+    float v = mad(tt,
+                  -mad(tt,
+                       mad(tt,
+                           mad(tt,
+                               mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+                               0x1.1566aap-14f),
+                           -0x1.6c16c2p-9f),
+                       0x1.555556p-3f),
+                  t);
+
+    float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+    // Scale by 2^p
+    float r =  as_float(as_int(y) + (p << 23));
+
+    const float ulim =  128.0f;
+    const float llim = -126.0f;
+
+    r = x < llim ? 0.0f : r;
+    r = x < ulim ? r : as_float(0x7f800000);
+    return isnan(x) ? x : r;
+}
+
+#endif
diff --git a/amd-builtins/math32/expF.cl b/amd-builtins/math32/expF.cl
new file mode 100644
index 0000000..0975dc4
--- /dev/null
+++ b/amd-builtins/math32/expF.cl
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if 0
+
+#define COMPILING_EXP
+#include "expF_base.h"
+
+#else
+
+#include "math32.h"
+
+__attribute__((overloadable, weak)) float
+exp(float x)
+{
+    // Reduce x
+    const float ln2HI = 0x1.62e300p-1f;
+    const float ln2LO = 0x1.2fefa2p-17f;
+    const float invln2 = 0x1.715476p+0f;
+
+    float fhalF = x < 0.0f ? -0.5f : 0.5f;
+    int p = mad(x, invln2, fhalF);
+    float fp = (float)p;
+    float hi = mad(fp, -ln2HI, x); // fp*ln2HI is exact here
+    float lo = -fp*ln2LO;
+
+    // Evaluate poly
+    float t = hi + lo;
+    float tt  = t*t;
+    float v = mad(tt,
+                  -mad(tt,
+                       mad(tt,
+                           mad(tt,
+                               mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+                               0x1.1566aap-14f),
+                           -0x1.6c16c2p-9f),
+                       0x1.555556p-3f),
+                  t);
+
+    float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+    // Scale by 2^p
+    float r =  as_float(as_int(y) + (p << 23));
+
+    const float ulim =  0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366
+    const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657
+
+    r = x < llim ? 0.0f : r;
+    r = x < ulim ? r : as_float(0x7f800000);
+    return isnan(x) ? x : r;
+}
+
+#endif
+
diff --git a/amd-builtins/math32/expF_base.h b/amd-builtins/math32/expF_base.h
new file mode 100644
index 0000000..8cf9ad4
--- /dev/null
+++ b/amd-builtins/math32/expF_base.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+//    Algorithm:
+//
+//    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
+//
+//    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
+//    n = 64*m + j,   0 <= j < 64
+//
+//    e^x = 2^((64*m + j + f)/64)
+//        = (2^m) * (2^(j/64)) * 2^(f/64)
+//        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
+//
+//    f = x*(64/ln(2)) - n
+//    r = f*(ln(2)/64) = x - n*(ln(2)/64)
+//
+//    e^x = (2^m) * (2^(j/64)) * e^r
+//
+//    (2^(j/64)) is precomputed
+//
+//    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+//    e^r = 1 + q
+//
+//    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+//
+//    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
+
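+//    Worked example (a sketch) for the natural exp path with x = 1.0f:
+//    n = (int)(1.0 * 64/ln(2)) = (int)92.33 = 92, so j = 92 & 0x3f = 28 and
+//    m = 92 >> 6 = 1; r = 1 - 92*(ln(2)/64) ~= 0.0036009, giving
+//    e^1 = 2^1 * 2^(28/64) * e^r ~= 2 * 1.354256 * 1.003607 ~= 2.718282.
+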
+__attribute__((overloadable, weak)) float
+#if defined(COMPILING_EXP2)
+exp2(float x)
+#elif defined(COMPILING_EXP10)
+exp10(float x)
+#else
+exp(float x)
+#endif
+{
+    USE_TABLE(float, p_tbl, EXP_TBL);
+
+#if defined(COMPILING_EXP2)
+    const float X_MAX =  0x1.fffffep+6f; // 128
+    const float X_MIN = -0x1.2a0000p+7f; // -149
+#elif defined(COMPILING_EXP10)
+    const float X_MAX =  0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959
+    const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332
+#else
+    const float X_MAX =  0x1.62e42ep+6f; // 128*log2 : 88.722839111673
+    const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184
+#endif
+
+#if defined(COMPILING_EXP2)
+    const float R_64 = 0x1.000000p+6f; // 2^6
+    const float R_1_BY_64 = 0x1.000000p-6f; // 2^-6
+    const float R_LN2 = 0x1.62e430p-1f; // 0.6931471805599453
+#elif defined(COMPILING_EXP10)
+    const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912
+    const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707
+    const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
+    const float R_LN10 = 0x1.26bb1cp+1f;
+#else
+    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
+    const float R_LOG2_BY_64_LD = 0x1.620000p-7f; /* log2/64 lead: 0.0108032227 */
+    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; /* log2/64 tail: 0.0000272020388 */
+#endif
+
+    int return_nan = isnan(x);
+    int return_inf = x > X_MAX;
+    int return_zero = x < X_MIN;
+
+#if defined(COMPILING_EXP2)
+    int n = convert_int(x * R_64);
+#elif defined(COMPILING_EXP10)
+    int n = convert_int(x * R_64_BY_LOG10_2);
+#else
+    int n = convert_int(x * R_64_BY_LOG2);
+#endif
+
+    float fn = (float)n;
+    int j = n & 0x3f;
+    int m = n >> 6;
+    int m2 = m << EXPSHIFTBITS_SP32;
+    float r;
+
+#if defined(COMPILING_EXP2)
+    r = R_LN2 * mad(-R_1_BY_64, fn, x);
+#elif defined(COMPILING_EXP10)
+    r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x));
+#else
+    r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x));
+#endif
+
+    // Truncated Taylor series for e^r
+    float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
+
+    float two_to_jby64 = p_tbl[j];
+    z2 = mad(two_to_jby64, z2, two_to_jby64);
+
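+    // For m <= -126 the result is at or near the subnormal range, where the
+    // exponent-bits adjustment below would wrap; instead multiply by 2^m
+    // built as the (possibly subnormal) float with bit pattern 1 << (m + 149).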
+    float z2s = z2 * as_float(0x1 << (m + 149));
+    float z2n = as_float(as_int(z2) + m2);
+    z2 = m <= -126 ? z2s : z2n;
+
+    z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
+    z2 = return_zero ? 0.0f : z2;
+    z2 = return_nan ? x : z2;
+    return z2;
+}
+
diff --git a/amd-builtins/math32/expF_table.h b/amd-builtins/math32/expF_table.h
new file mode 100644
index 0000000..3d6759f
--- /dev/null
+++ b/amd-builtins/math32/expF_table.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(float, EXP_TBL, 65,
+    0x1.000000p+0f,
+    0x1.02c9a4p+0f,
+    0x1.059b0ep+0f,
+    0x1.087452p+0f,
+    0x1.0b5586p+0f,
+    0x1.0e3ec4p+0f,
+    0x1.11301ep+0f,
+    0x1.1429aap+0f,
+    0x1.172b84p+0f,
+    0x1.1a35bep+0f,
+    0x1.1d4874p+0f,
+    0x1.2063b8p+0f,
+    0x1.2387a6p+0f,
+    0x1.26b456p+0f,
+    0x1.29e9e0p+0f,
+    0x1.2d285ap+0f,
+    0x1.306fe0p+0f,
+    0x1.33c08cp+0f,
+    0x1.371a74p+0f,
+    0x1.3a7db4p+0f,
+    0x1.3dea64p+0f,
+    0x1.4160a2p+0f,
+    0x1.44e086p+0f,
+    0x1.486a2cp+0f,
+    0x1.4bfdaep+0f,
+    0x1.4f9b28p+0f,
+    0x1.5342b6p+0f,
+    0x1.56f474p+0f,
+    0x1.5ab07ep+0f,
+    0x1.5e76f2p+0f,
+    0x1.6247ecp+0f,
+    0x1.662388p+0f,
+    0x1.6a09e6p+0f,
+    0x1.6dfb24p+0f,
+    0x1.71f75ep+0f,
+    0x1.75feb6p+0f,
+    0x1.7a1148p+0f,
+    0x1.7e2f34p+0f,
+    0x1.82589ap+0f,
+    0x1.868d9ap+0f,
+    0x1.8ace54p+0f,
+    0x1.8f1aeap+0f,
+    0x1.93737cp+0f,
+    0x1.97d82ap+0f,
+    0x1.9c4918p+0f,
+    0x1.a0c668p+0f,
+    0x1.a5503cp+0f,
+    0x1.a9e6b6p+0f,
+    0x1.ae89fap+0f,
+    0x1.b33a2cp+0f,
+    0x1.b7f770p+0f,
+    0x1.bcc1eap+0f,
+    0x1.c199bep+0f,
+    0x1.c67f12p+0f,
+    0x1.cb720ep+0f,
+    0x1.d072d4p+0f,
+    0x1.d5818ep+0f,
+    0x1.da9e60p+0f,
+    0x1.dfc974p+0f,
+    0x1.e502eep+0f,
+    0x1.ea4afap+0f,
+    0x1.efa1bep+0f,
+    0x1.f50766p+0f,
+    0x1.fa7c18p+0f,
+    0x1.000000p+1f,
+)
+
+DECLARE_TABLE(float2, EXP_TBL_EP, 65,
+    (float2)(0x1.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.02c000p+0f, 0x1.347ceep-13f),
+    (float2)(0x1.058000p+0f, 0x1.b0d314p-12f),
+    (float2)(0x1.084000p+0f, 0x1.a28c3ap-11f),
+    (float2)(0x1.0b4000p+0f, 0x1.586cf8p-12f),
+    (float2)(0x1.0e0000p+0f, 0x1.f61968p-11f),
+    (float2)(0x1.110000p+0f, 0x1.80e808p-11f),
+    (float2)(0x1.140000p+0f, 0x1.4d5754p-11f),
+    (float2)(0x1.170000p+0f, 0x1.5c1e3ep-11f),
+    (float2)(0x1.1a0000p+0f, 0x1.adf5b6p-11f),
+    (float2)(0x1.1d4000p+0f, 0x1.0e62d0p-13f),
+    (float2)(0x1.204000p+0f, 0x1.1dc430p-11f),
+    (float2)(0x1.238000p+0f, 0x1.e9b9d4p-14f),
+    (float2)(0x1.268000p+0f, 0x1.a2b2f0p-11f),
+    (float2)(0x1.29c000p+0f, 0x1.4efa8ep-11f),
+    (float2)(0x1.2d0000p+0f, 0x1.42d372p-11f),
+    (float2)(0x1.304000p+0f, 0x1.7f0518p-11f),
+    (float2)(0x1.33c000p+0f, 0x1.164c82p-17f),
+    (float2)(0x1.370000p+0f, 0x1.a7373ap-12f),
+    (float2)(0x1.3a4000p+0f, 0x1.ed9a72p-11f),
+    (float2)(0x1.3dc000p+0f, 0x1.532608p-11f),
+    (float2)(0x1.414000p+0f, 0x1.0510fap-11f),
+    (float2)(0x1.44c000p+0f, 0x1.043030p-11f),
+    (float2)(0x1.484000p+0f, 0x1.515ae0p-11f),
+    (float2)(0x1.4bc000p+0f, 0x1.ed6a9ap-11f),
+    (float2)(0x1.4f8000p+0f, 0x1.b2769cp-12f),
+    (float2)(0x1.534000p+0f, 0x1.5ab4eap-15f),
+    (float2)(0x1.56c000p+0f, 0x1.a39b5ap-11f),
+    (float2)(0x1.5a8000p+0f, 0x1.83eea4p-11f),
+    (float2)(0x1.5e4000p+0f, 0x1.b78ad6p-11f),
+    (float2)(0x1.624000p+0f, 0x1.fac0e8p-14f),
+    (float2)(0x1.660000p+0f, 0x1.1c412ap-11f),
+    (float2)(0x1.6a0000p+0f, 0x1.3cccfep-13f),
+    (float2)(0x1.6dc000p+0f, 0x1.d91e32p-11f),
+    (float2)(0x1.71c000p+0f, 0x1.baf476p-11f),
+    (float2)(0x1.75c000p+0f, 0x1.f5ab20p-11f),
+    (float2)(0x1.7a0000p+0f, 0x1.1473eap-12f),
+    (float2)(0x1.7e0000p+0f, 0x1.799b66p-11f),
+    (float2)(0x1.824000p+0f, 0x1.89994cp-12f),
+    (float2)(0x1.868000p+0f, 0x1.b33688p-13f),
+    (float2)(0x1.8ac000p+0f, 0x1.ca8454p-13f),
+    (float2)(0x1.8f0000p+0f, 0x1.ae9914p-12f),
+    (float2)(0x1.934000p+0f, 0x1.9bd866p-11f),
+    (float2)(0x1.97c000p+0f, 0x1.829fdep-12f),
+    (float2)(0x1.9c4000p+0f, 0x1.230546p-13f),
+    (float2)(0x1.a0c000p+0f, 0x1.99ed76p-14f),
+    (float2)(0x1.a54000p+0f, 0x1.03b23ep-12f),
+    (float2)(0x1.a9c000p+0f, 0x1.35aabcp-11f),
+    (float2)(0x1.ae8000p+0f, 0x1.3f32b4p-13f),
+    (float2)(0x1.b30000p+0f, 0x1.d15c26p-11f),
+    (float2)(0x1.b7c000p+0f, 0x1.bb797cp-11f),
+    (float2)(0x1.bcc000p+0f, 0x1.e904bcp-16f),
+    (float2)(0x1.c18000p+0f, 0x1.9bdd84p-12f),
+    (float2)(0x1.c64000p+0f, 0x1.f8972ap-11f),
+    (float2)(0x1.cb4000p+0f, 0x1.906e76p-11f),
+    (float2)(0x1.d04000p+0f, 0x1.96a502p-11f),
+    (float2)(0x1.d58000p+0f, 0x1.8dcfbap-16f),
+    (float2)(0x1.da8000p+0f, 0x1.e603dap-12f),
+    (float2)(0x1.dfc000p+0f, 0x1.2e66f6p-13f),
+    (float2)(0x1.e50000p+0f, 0x1.773c58p-15f),
+    (float2)(0x1.ea4000p+0f, 0x1.5f4548p-13f),
+    (float2)(0x1.ef8000p+0f, 0x1.0df730p-11f),
+    (float2)(0x1.f50000p+0f, 0x1.d96db8p-14f),
+    (float2)(0x1.fa4000p+0f, 0x1.e0c0cep-11f),
+    (float2)(0x1.000000p+1f, 0x0.000000p+0f),
+)
+
diff --git a/amd-builtins/math32/expm1F.cl b/amd-builtins/math32/expm1F.cl
new file mode 100644
index 0000000..1584280
--- /dev/null
+++ b/amd-builtins/math32/expm1F.cl
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+/* See the exp routine for the algorithm. */
+
+__attribute__((overloadable)) float
+expm1(float x)
+{
+    USE_TABLE(float2, p_tbl, EXP_TBL_EP);
+
+    const float X_MAX = 0x1.62e42ep+6f;  // 128*log2 : 88.722839111673
+    const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184
+
+    const float R_64_BY_LOG2 = 0x1.715476p+6f;     // 64/log2 : 92.332482616893657
+    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
+    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
+
+    uint xi = as_uint(x);
+    int n = (int)(x * R_64_BY_LOG2);
+    float fn = (float)n;
+
+    int j = n & 0x3f;
+    int m = n >> 6;
+
+    float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x));
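+    // Reduction sketch: writing x = (64*m + j)*ln(2)/64 + r gives
+    // exp(x) = 2^m * 2^(j/64) * exp(r), and therefore
+    // expm1(x) = 2^m * 2^(j/64) * (exp(r) - 1) + (2^m * 2^(j/64) - 1),
+    // which is the combination assembled below from the table entry and z2.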
+
+    // Truncated Taylor series: r + r^2/2 + r^3/6 + r^4/24
+    float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f, 0x1.555556p-3f), 0.5f), r);
+
+    float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+    float2 tv = p_tbl[j];
+    float two_to_jby64_h = tv.s0 * m2;
+    float two_to_jby64_t = tv.s1 * m2;
+    float two_to_jby64 = two_to_jby64_h + two_to_jby64_t;
+
+    z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f);
+    // Make subnormal results work
+    z2 = x == 0.0f ? x : z2;
+    z2 = x < X_MIN | m < -24 ? -1.0f : z2;
+    z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2;
+    z2 = isnan(x) ? x : z2;
+
+    return z2;
+}
+
diff --git a/amd-builtins/math32/fabsF.cl b/amd-builtins/math32/fabsF.cl
new file mode 100644
index 0000000..f1bdaad
--- /dev/null
+++ b/amd-builtins/math32/fabsF.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+fabs(float x)
+{
+    return __amdil_fabs_f32(x);
+}
+
diff --git a/amd-builtins/math32/fdimF.cl b/amd-builtins/math32/fdimF.cl
new file mode 100644
index 0000000..960857f
--- /dev/null
+++ b/amd-builtins/math32/fdimF.cl
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+fdim(float x, float y)
+{
+    int n = -(isnan(x) | isnan(y)) & QNANBITPATT_SP32;
+    int r = -(x > y) & as_int(x - y);
+    return as_float(n | r);
+}
+
+__attribute__((overloadable, always_inline)) float4
+fdim(float4 x, float4 y)
+{
+    int4 n = ~((x == x) & (y == y)) & QNANBITPATT_SP32;
+    int4 r = (x > y) & as_int4(x - y);
+    return as_float4(n | r);
+}
+
diff --git a/amd-builtins/math32/floorF.cl b/amd-builtins/math32/floorF.cl
new file mode 100644
index 0000000..38490f0
--- /dev/null
+++ b/amd-builtins/math32/floorF.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+floor(float x)
+{
+    return __amdil_round_neginf_f32(x);
+}
+
diff --git a/amd-builtins/math32/fmaF.cl b/amd-builtins/math32/fmaF.cl
new file mode 100644
index 0000000..053a356
--- /dev/null
+++ b/amd-builtins/math32/fmaF.cl
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float4
+fma(float4 a, float4 b, float4 c)
+{
+    float4 ret;
+    ret.lo = fma(a.lo, b.lo, c.lo);
+    ret.hi = fma(a.hi, b.hi, c.hi);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float2
+fma(float2 a, float2 b, float2 c)
+{
+    float2 ret;
+    ret.lo = fma(a.lo, b.lo, c.lo);
+    ret.hi = fma(a.hi, b.hi, c.hi);
+    return ret;
+}
+
+#define CM(C, B, A) as_float(__amdil_cmov_logical_i32(as_uint(C), as_uint(B), as_uint(A)))
+
+__attribute__((overloadable, always_inline)) float
+fma(float a, float b, float c)
+{
+    if (HAVE_HW_FMA32()) {
+        return __amdil_fma_f32(a, b, c);
+    } else {
+        float z3 = mad(a, b, c);
+        float cs = c;
+    
+        int ae = as_int(a) >> 23;
+        int be = as_int(b) >> 23;
+        int ce = as_int(c) >> 23;
+    
+        ae &= 0xff;
+        be &= 0xff;
+        ce &= 0xff;
+    
+        ae -= 127;
+        be -= 127;
+        ce -= 127;
+    
+        int pe = ae + be;
+    
+        int cen = ce - pe;
+        cen += 127;
+        cen <<= 23;
+    
+        // special cases flag
+        int spclal = ae == -127;
+        int spclbl = be == -127;
+        int spclcl = ce == -127;
+    
+        int spclah = ae == 128;
+        int spclbh = be == 128;
+        int spclch = ce == 128;
+    
+        spclal |= spclah;
+        spclbl |= spclbh;
+        spclcl |= spclch;
+    
+        int spcl = spclal | spclbl;
+        spcl |= spclcl;
+    
+        int spcl2 = spclah | spclbh;
+        spcl2 = ~spcl2;
+        spcl2 &= spclch;
+    
+        // Normalize
+        int an = as_int(a) & 0x807fffff;
+        int bn = as_int(b) & 0x807fffff;
+        int cn = as_int(c) & 0x807fffff;
+    
+        an |= 0x3f800000;
+        bn |= 0x3f800000;
+        cn |= cen;
+    
+        a = as_float(an);
+        b = as_float(bn);
+        c = as_float(cn);
+    
+        // Get head & tail parts of a, b
+        float ah = as_float(an & 0xfffff000);
+        float bh = as_float(bn & 0xfffff000);
+    
+        float at = a - ah;
+        float bt = b - bh;
+    
+        // Get head & tail parts of the product a*b
+        float p = a * b;
+        float pt = mad(ah, bh, -p);
+        pt = mad(ah, bt, pt);
+        pt = mad(at, bh, pt);
+        pt = mad(at, bt, pt);
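+        // Rationale: ah and bh keep only the top 12 significant bits, so each
+        // partial product above is exact in single precision; up to rounding
+        // in the accumulation, p + pt reproduces a*b exactly.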
+    
+        // Carefully add p and c; these steps are valid only when pe and ce are not far apart
+        float rr = p + c;
+        float t1 = p - rr;
+        t1 += c;
+        float t2 = c - rr;
+        t2 += p;
+        int pick1 = as_int(p) & 0x7fffffff;
+        int pick2 = as_int(c) & 0x7fffffff;
+        int pick = pick1 > pick2;
+        float t = CM(pick, t1, t2);
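+        // Two-sum: t is the rounding error of rr = p + c, computed branch-free
+        // by conditioning on which operand has the larger magnitude.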
+    
+        float vv = t + pt;
+        float ww1 = t - vv;
+        ww1 += pt;
+        float ww2 = pt - vv;
+        ww2 += t;
+        pick1 = as_int(t) & 0x7fffffff;
+        pick2 = as_int(pt) & 0x7fffffff;
+        pick = pick1 > pick2;
+        float ww = CM(pick, ww1, ww2);
+    
+        // Pick r, v, w based on how far apart pe and ce are;
+        // 60 is a safe threshold, the tight bound being close to 24+24+2
+        pick1 = pe - ce;
+        pick = pick1 < 60;
+        float r = CM(pick, rr, p);
+        float v = CM(pick, vv, pt);
+        float w = CM(pick, ww, cs);
+    
+        // Identify whether rounding left v exactly half an ulp of r (a tie) with a nonzero tail w, so a correction is needed
+        int rndc1 = as_int(r) & 0x7f800000;
+        int rndc2 = as_int(v) & 0x7f800000;
+        int rndc = rndc1 - rndc2;
+        rndc = rndc == 0x0c000000;
+        rndc1 = as_int(v) & 0x007fffff;
+        rndc1 = rndc1 == 0;
+        rndc2 = as_int(w) & 0x7fffffff;
+        rndc2 = rndc2 != 0;
+        rndc &= rndc1;
+        rndc &= rndc2;
+    
+        int ws = as_int(w) & 0x80000000;
+        int ve = as_int(v) & 0x7f800000;
+        ve -= 0x0b800000;
+        w = as_float(ws | ve);
+    
+        float vw = v + w;
+        v = CM(rndc, vw, v);
+        float z = r + v;
+    
+        // reconstruct return value
+        int ze = as_int(z) >> 23;
+        ze &= 0xff;
+        ze -= 127;
+        ze += pe;
+        ze += 127;
+    
+        int z1e = ze & 0xff;
+        z1e <<= 23;
+        int z1 = as_int(z) & 0x807fffff;
+        z1 |= z1e;
+    
+        pick1 = as_int(z) & 0x7fffffff;
+        pick = pick1 == 0;
+        z = CM(pick, z, z1);
+    
+        int z2 = as_int(z) & 0x80000000;
+        pick = ze <= 0;
+        z = CM(pick, z2, z);
+        z2 |=  0x7f800000;
+        pick = ze > 254;
+        z = CM(pick, z2, z);
+    
+        pick1 = ce - pe;
+        pick = pick1 > 30;
+        z = CM(pick, cs, z);
+        z = CM(spcl, z3, z);
+        z = CM(spcl2, cs, z);
+        return z;
+    }
+}
+
diff --git a/amd-builtins/math32/fmaxF.cl b/amd-builtins/math32/fmaxF.cl
new file mode 100644
index 0000000..f8b48a6
--- /dev/null
+++ b/amd-builtins/math32/fmaxF.cl
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+extern __attribute__((pure)) float __hsail_max_f32(float,float);
+
+__attribute__ ((overloadable, always_inline)) float
+fmax(float x, float y)
+{
+    return __hsail_max_f32(x, y);
+}
diff --git a/amd-builtins/math32/fminF.cl b/amd-builtins/math32/fminF.cl
new file mode 100644
index 0000000..f78ad5b
--- /dev/null
+++ b/amd-builtins/math32/fminF.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+extern __attribute__((pure)) float __hsail_min_f32(float,float);
+
+__attribute__((overloadable, always_inline)) float
+fmin(float x, float y)
+{
+    // Semantics, including subnormal handling, follow the HSAIL min intrinsic.
+    return __hsail_min_f32(x, y);
+}
diff --git a/amd-builtins/math32/fmodF.cl b/amd-builtins/math32/fmodF.cl
new file mode 100644
index 0000000..3fb552c
--- /dev/null
+++ b/amd-builtins/math32/fmodF.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_FMOD
+
+#include "remainderF.h"
diff --git a/amd-builtins/math32/fractF.cl b/amd-builtins/math32/fractF.cl
new file mode 100644
index 0000000..55dc3fc
--- /dev/null
+++ b/amd-builtins/math32/fractF.cl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+fract(float x, float *ip)
+{
+    float i = __amdil_round_neginf_f32(x);
+    float r = x - i;
+    r = __amdil_min_f32(r, 0x1.fffffep-1f);
+    r = isinf(x) ? 0.0f : r;
+    r = isnan(x) ? x : r;
+    *ip = i;
+    return r;
+}
+
+#if __OPENCL_C_VERSION__ < 200
+
+__attribute__((overloadable, always_inline)) float
+fract(float x, __local float *ip)
+{
+    float i;
+    float f = fract(x, &i);
+    *ip = i;
+    return f;
+}
+
+__attribute__((overloadable, always_inline)) float
+fract(float x, __global float *ip)
+{
+    float i;
+    float f = fract(x, &i);
+    *ip = i;
+    return f;
+}
+
+#endif
diff --git a/amd-builtins/math32/frexpF.cl b/amd-builtins/math32/frexpF.cl
new file mode 100644
index 0000000..75d41c6
--- /dev/null
+++ b/amd-builtins/math32/frexpF.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+frexp(float x, int *ep)
+{
+    int i = as_int(x);
+    int ai = i & 0x7fffffff;
+    int d = ai > 0 & ai < 0x00800000;
+    // scale subnormal by 2^26 without multiplying
+    float s = as_float(ai | 0x0d800000) - 0x1.0p-100F;
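+    // (ai | 0x0d800000) reads back as 2^-100 + m*2^-123 for mantissa m, so the
+    // subtraction leaves m*2^-123 == (m*2^-149) * 2^26, a normal-range copy of x.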
+    ai = d ? as_int(s) : ai;
+    int e = (ai >> 23) - 126 - (d ? 26 : 0);
+    int t = ai == 0 | e == 129;
+    i = (i & 0x80000000) | 0x3f000000 | (ai & 0x007fffff);
+    *ep = t ? 0 : e;
+    return t ? x : as_float(i);
+}
+
+#if __OPENCL_C_VERSION__ < 200
+
+__attribute__((overloadable, always_inline, weak)) float
+frexp(float x, __local int *ep)
+{
+    int e;
+    float f = frexp(x, &e);
+    *ep = e;
+    return f;
+}
+
+__attribute__((overloadable, always_inline, weak)) float
+frexp(float x, __global int *ep)
+{
+    int e;
+    float f = frexp(x, &e);
+    *ep = e;
+    return f;
+}
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float4
+frexp(float4 x, int4 *ep)
+{
+    int4 i = as_int4(x);
+    int4 ai = i & 0x7fffffff;
+    int4 d = ai > 0 & ai < 0x00800000;
+    float4 s = as_float4(ai | 0x0d800000) - 0x1.0p-100F;
+    ai = bitselect(ai, as_int4(s), d);
+    int4 e = (ai >> 23) - 126 - bitselect((int4)0, (int4)26, d);
+    int4 t = ai == (int4)0 | e == (int4)129;
+    i = (i & (int4)0x80000000) | (int4)0x3f000000 | (ai & 0x007fffff);
+    *ep = bitselect(e, (int4)0, t);
+    return bitselect(as_float4(i), x, as_float4(t));
+}
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float4
+frexp(float4 x, __global int4 *ep)
+{
+    int4 e;
+    float4 ret = frexp(x, &e);
+    *ep = e;
+    return ret;
+}
+
+__attribute__((overloadable, always_inline, weak)) float4
+frexp(float4 x, __local int4 *ep)
+{
+    int4 e;
+    float4 ret = frexp(x, &e);
+    *ep = e;
+    return ret;
+}
+#endif
diff --git a/amd-builtins/math32/half_cosF.cl b/amd-builtins/math32/half_cosF.cl
new file mode 100644
index 0000000..d8c7b9c
--- /dev/null
+++ b/amd-builtins/math32/half_cosF.cl
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "sincosF_piby4.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+half_cos(float x)
+{
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    float dx = as_float(ax);
+
+    float r0, r1;
+    int regn = argReductionSmallS(&r0, &r1, dx);
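+    // regn is the quadrant of the reduced argument: odd quadrants take the
+    // (negated) sine path, and quadrants 2 and 3 flip the sign below.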
+
+    float ss = -sinf_piby4_new(r0, r1);
+    float cc =  cosf_piby4_new(r0, r1);
+    float c = (regn & 1) != 0 ? ss : cc;
+    c = as_float(as_int(c) ^ ((regn > 1) << 31));
+
+    c = ax > 0x47800000 ? 1.0f : c;
+    c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c;
+    return c;
+}
+
diff --git a/amd-builtins/math32/half_divideF.cl b/amd-builtins/math32/half_divideF.cl
new file mode 100644
index 0000000..b3e3d8b
--- /dev/null
+++ b/amd-builtins/math32/half_divideF.cl
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+half_divide(float x, float y)
+{
+    int c = fabs(y) > 0x1.0p+96f;
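+    // Scale huge divisors down by 2^-32 and the quotient by the same factor;
+    // this plausibly keeps a reciprocal-based native_divide away from the
+    // subnormal range, since 1/y stays comfortably normal once |y| <= 2^96.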
+    float s = c ? 0x1.0p-32f : 1.0f;
+    y *= s;
+    return s * native_divide(x, y);
+}
+
+//__attribute__((overloadable, always_inline)) float2
+//half_divide(float2 x, float2 y)
+//{
+//    int2 c = fabs(y) > 0x1.0p+96f;
+//    float2 s = c ? 0x1.0p-32f : 1.0f;
+//    y *= s;
+//    return s * native_divide(x, y);
+//}
+//
+//__attribute__((overloadable, always_inline)) float4
+//half_divide(float4 x, float4 y)
+//{
+//    int4 c = fabs(y) > 0x1.0p+96f;
+//    float4 s = c ? 0x1.0p-32f : 1.0f;
+//    y *= s;
+//    return s * native_divide(x, y);
+//}
+
diff --git a/amd-builtins/math32/half_expF.cl b/amd-builtins/math32/half_expF.cl
new file mode 100644
index 0000000..bd21b76
--- /dev/null
+++ b/amd-builtins/math32/half_expF.cl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+half_exp(float x)
+{
+    return native_exp(x);
+}
+
+//__attribute__((overloadable, always_inline)) float2
+//half_exp(float2 x)
+//{
+//    return native_exp(x);
+//}
+//
+//__attribute__((overloadable, always_inline)) float4
+//half_exp(float4 x)
+//{
+//    return native_exp(x);
+//}
+
+__attribute__((overloadable, always_inline, weak)) float
+half_exp2(float x)
+{
+    return native_exp2(x);
+}
+
+//__attribute__((overloadable, always_inline)) float2
+//half_exp2(float2 x)
+//{
+//    return native_exp2(x);
+//}
+//
+//__attribute__((overloadable, always_inline)) float4
+//half_exp2(float4 x)
+//{
+//    return native_exp2(x);
+//}
+
+__attribute__((overloadable, always_inline, weak)) float
+half_exp10(float x)
+{
+    return native_exp10(x);
+}
+
+//__attribute__((overloadable, always_inline)) float2
+//half_exp10(float2 x)
+//{
+//    return native_exp10(x);
+//}
+//
+//__attribute__((overloadable, always_inline)) float4
+//half_exp10(float4 x)
+//{
+//    return native_exp10(x);
+//}
+
diff --git a/amd-builtins/math32/half_logF.cl b/amd-builtins/math32/half_logF.cl
new file mode 100644
index 0000000..fad736a
--- /dev/null
+++ b/amd-builtins/math32/half_logF.cl
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+half_log2(float x)
+{
+    return native_log2(x);
+}
+
+__attribute__((overloadable, always_inline, weak)) float
+half_log10(float x)
+{
+    return native_log10(x);
+}
+
+__attribute__((overloadable, always_inline, weak)) float
+half_log(float x)
+{
+    return native_log(x);
+}
diff --git a/amd-builtins/math32/half_powrF.cl b/amd-builtins/math32/half_powrF.cl
new file mode 100644
index 0000000..978cdeb
--- /dev/null
+++ b/amd-builtins/math32/half_powrF.cl
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+half_powr(float x, float y)
+{
+    return powr(x, y);
+}
+
+//__attribute__((overloadable, always_inline)) float4
+//half_powr(float4 x, float4 y)
+//{
+//    return powr(x, y);
+//}
+
diff --git a/amd-builtins/math32/half_recipF.cl b/amd-builtins/math32/half_recipF.cl
new file mode 100644
index 0000000..43af15f
--- /dev/null
+++ b/amd-builtins/math32/half_recipF.cl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+extern __attribute__((pure)) float __amdil_div_f32(float,float);
+//extern __attribute__((pure)) float2 __amdil_div_v2f32(float2,float2);
+//extern __attribute__((pure)) float4 __amdil_div_v4f32(float4,float4);
+
+__attribute__((overloadable, always_inline)) float
+half_recip(float x)
+{
+    return __amdil_div_f32(1.0f, x);
+}
+
+//__attribute__((overloadable, always_inline)) float2
+//half_recip(float2 x)
+//{
+//    return __amdil_div_v2f32((float2)1.0f, x);
+//}
+//
+//__attribute__((overloadable, always_inline)) float3
+//half_recip(float3 x)
+//{
+//    float3 ret;
+//    ret.s01 =  __amdil_div_v2f32((float2)1.0f, x.s01);
+//    ret.s2 = __amdil_div_f32(1.0f, x.s2);
+//    return ret;
+//}
+//
+//__attribute__((overloadable, always_inline)) float4
+//half_recip(float4 x)
+//{
+//    return __amdil_div_v4f32((float4)1.0f, x);
+//}
+
diff --git a/amd-builtins/math32/half_rsqrtF.cl b/amd-builtins/math32/half_rsqrtF.cl
new file mode 100644
index 0000000..8268f72
--- /dev/null
+++ b/amd-builtins/math32/half_rsqrtF.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+half_rsqrt(float x)
+{
+    return native_rsqrt(x);
+}
diff --git a/amd-builtins/math32/half_sinF.cl b/amd-builtins/math32/half_sinF.cl
new file mode 100644
index 0000000..b1c7201
--- /dev/null
+++ b/amd-builtins/math32/half_sinF.cl
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "sincosF_piby4.h"
+
+__attribute__((overloadable, weak)) float
+half_sin(float x)
+{
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+
+    float dx = as_float(ax);
+
+    float r0, r1;
+    int regn = argReductionSmallS(&r0, &r1, dx);
+
+    float ss = sinf_piby4_new(r0, r1);
+    float cc = cosf_piby4_new(r0, r1);
+    float s = (regn & 1) != 0 ? cc : ss;
+    s = as_float(as_int(s) ^ ((regn > 1) << 31));
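+    // Quadrant selection: sine in odd quadrants becomes cosine, and quadrants
+    // 2 and 3 negate; the sign of x itself is reapplied below via (ix ^ ax).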
+
+    s = ax > 0x47800000 ? 1.0f : s;
+    s = as_float(as_int(s) ^ (ix ^ ax));
+    s = x == 0.0f ? x : s;
+    s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s;
+    return s;
+}
+
diff --git a/amd-builtins/math32/half_sincos.cl b/amd-builtins/math32/half_sincos.cl
new file mode 100644
index 0000000..31f63e4
--- /dev/null
+++ b/amd-builtins/math32/half_sincos.cl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "sincosF_piby4.h"
+
+// Calculate half_sin and half_cos at once
+
+__attribute__((always_inline, weak)) float
+__hsa_half_sincos(float x, float *cp)
+{
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    float dx = as_float(ax);
+
+    float r0, r1;
+    int regn = argReductionSmallS(&r0, &r1, dx);
+
+    float ss = sinf_piby4_new(r0, r1);
+    float cc = cosf_piby4_new(r0, r1);
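+// Note on the masks: the scalar relationals above return 0 or 1, so -(cond)
+// builds an all-ones mask; the vector relationals below already return
+// 0 or -1 per lane and need no negation.
+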
+    bool reg0 = (regn & 1) != 0;
+    float c = reg0 ? -ss : cc;
+    float s = reg0 ? cc : ss;
+    int xsign = ((regn > 1) << 31);
+    c = as_float(as_int(c) ^ xsign);
+    s = as_float(as_int(s) ^ xsign);
+
+    bool is_huge = ax > 0x47800000;
+    c = is_huge ? 1.0f : c;
+    s = is_huge ? 1.0f : s;
+    s = as_float(as_int(s) ^ (ix ^ ax));
+    s = x == 0.0f ? x : s;
+    bool is_inf = ax >= PINFBITPATT_SP32;
+    c = is_inf ? as_float(QNANBITPATT_SP32) : c;
+    s = is_inf ? as_float(QNANBITPATT_SP32) : s;
+    *cp = c;
+    return s;
+}
diff --git a/amd-builtins/math32/half_sqrtF.cl b/amd-builtins/math32/half_sqrtF.cl
new file mode 100644
index 0000000..4e3f937
--- /dev/null
+++ b/amd-builtins/math32/half_sqrtF.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+half_sqrt(float x)
+{
+    return native_sqrt(x);
+}
diff --git a/amd-builtins/math32/half_tanF.cl b/amd-builtins/math32/half_tanF.cl
new file mode 100644
index 0000000..1b20ba6
--- /dev/null
+++ b/amd-builtins/math32/half_tanF.cl
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "tanF_piby4.h"
+
+__attribute__((overloadable)) float
+half_tan(float x)
+{
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    float dx = as_float(ax);
+
+    float r0, r1;
+    int regn = argReductionSmallS(&r0, &r1, dx);
+
+    float t = tanf_piby4_new(r0, regn);
+    t = ix != ax ? -t : t;
+    t = x == 0.0f ? x : t;
+    t = ax > 0x47800000 ? 0.0f : t;
+    t = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : t;
+    return t;
+}
+
diff --git a/amd-builtins/math32/hypotF.cl b/amd-builtins/math32/hypotF.cl
new file mode 100644
index 0000000..36f2931
--- /dev/null
+++ b/amd-builtins/math32/hypotF.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#if !defined(SUBNORMALS_SUPPORTED)
+#include "floattointconversion.h"
+#endif //SUBNORMALS_SUPPORTED
+
+// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result warrants it.
+
+__attribute__((overloadable, always_inline)) float
+hypot(float x, float y)
+{
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint uy = as_uint(y);
+    uint auy = uy & EXSIGNBIT_SP32;
+    float retval;
+    int c = aux > auy;
+    ux = c ? aux : auy;
+    uy = c ? auy : aux;
+
+#if !defined(SUBNORMALS_SUPPORTED)
+	if (as_float(uy) > 0.0f && ux < 0x7effffff)
+	{
+#endif	//SUBNORMALS_SUPPORTED
+		int xexp = clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126);
+		float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+		float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+		float fx = as_float(ux) * fi_exp;
+		float fy = as_float(uy) * fi_exp;
+		retval = MATH_SQRT(mad(fx, fx, fy*fy)) * fx_exp;
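+		// fi_exp == 2^-xexp rescales the larger operand to near 1.0, so the
+		// sum of squares can neither overflow nor underflow; fx_exp == 2^xexp
+		// restores the true scale after the square root.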
+#if !defined(SUBNORMALS_SUPPORTED)
+	}
+	else
+	{
+		double dy = float_uint_to_double(as_uint(y));
+		double dx = float_uint_to_double(as_uint(x));
+		double dretval = sqrt(dx*dx + dy*dy);
+		retval = as_float(double_to_float_uint(dretval));
+	}
+#endif //SUBNORMALS_SUPPORTED
+
+    retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
+    retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32 ? as_float(PINFBITPATT_SP32) : retval;
+    return retval;
+}
diff --git a/amd-builtins/math32/ilogbF.cl b/amd-builtins/math32/ilogbF.cl
new file mode 100644
index 0000000..e2a737c
--- /dev/null
+++ b/amd-builtins/math32/ilogbF.cl
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) int
+ilogb(float x)
+{
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    int rs = -118 - (int)clz(ux & MANTBITS_SP32);
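+    // Subnormal case: x = m * 2^-149 with the top bit of m at position
+    // 31 - clz(m), so ilogb(x) = (31 - clz(m)) - 149 = -118 - clz(m).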
+    int r = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    r = ax < 0x00800000U ? rs : r;
+    r = ax > EXPBITS_SP32 | ax == 0 ? 0x80000000 : r;
+    r = ax == EXPBITS_SP32 ? 0x7fffffff : r;
+    return r;
+}
+
diff --git a/amd-builtins/math32/ldexpF.cl b/amd-builtins/math32/ldexpF.cl
new file mode 100644
index 0000000..f6c3492
--- /dev/null
+++ b/amd-builtins/math32/ldexpF.cl
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+ldexp(float x, int n)
+{
+#if 0
+    // Simpler variant that treats subnormals as zeros
+    int i = as_int(x);
+    int e = (i >> 23) & 0xff;
+    int m = i & 0x007fffff;
+    int s = i & 0x80000000;
+    int v = add_sat(e, n);
+    v = clamp(v, 0, 0xff);
+    int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
+    int c = e == 0xff;
+    mr = c ? m : mr;
+    int er = c ? e : v;
+    er = e ? er : e;
+    return as_float(s | (er << 23) | mr);
+#endif
+
+    /* This version supports denormal values */
+    const int multiplier = 24;
+    float val_f;
+    uint val_ui;
+    uint sign;
+    int exponent;
+    val_ui = as_uint(x);
+    sign = val_ui & 0x80000000;
+    val_ui = val_ui & 0x7fffffff; /* remove the sign bit */
+    int val_x = val_ui;
+
+    exponent = val_ui >> 23; /* extract the exponent */
+    int dexp = exponent;
+
+    /* Denormal support: fbh counts how far the leading fraction bit must be
+     * shifted to normalize; the renormalized dval_ui carries an extra factor
+     * of 2^24 (== 2^multiplier) that is subtracted from the exponent below. */
+    int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0)) >> 23);
+    int dexponent = 25 - fbh;
+    uint dval_ui = (((val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+    int ex = dexponent + n - multiplier;
+    dexponent = ex;
+    uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+    int ex1 = dexponent + multiplier;
+    ex1 = -ex1 + 25;
+    dval_ui = (((dval_ui & 0x007fffff) | 0x800000) >> ex1);
+    dval_ui = dexponent > 0 ? val : dval_ui;
+    dval_ui = dexponent > 254 ? 0x7f800000 : dval_ui; /* overflow */
+    dval_ui = dexponent < -multiplier ? 0 : dval_ui;  /* underflow */
+    dval_ui = dval_ui | sign;
+
+    /* Normal path */
+    exponent += n;
+
+    val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+    ex1 = exponent + multiplier;
+    ex1 = -ex1 + 25;
+    val_ui = (((val_ui & 0x007fffff) | 0x800000) >> ex1);
+    val_ui = exponent > 0 ? val : val_ui;
+    val_ui = exponent > 254 ? 0x7f800000 : val_ui; /* overflow */
+    val_ui = exponent < -multiplier ? 0 : val_ui;  /* underflow */
+    val_ui = val_ui | sign;
+
+    val_ui = dexp == 0 ? dval_ui : val_ui;
+    val_f = as_float(val_ui);
+
+    val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f;
+    return val_f;
+}
diff --git a/amd-builtins/math32/lgammaF.cl b/amd-builtins/math32/lgammaF.cl
new file mode 100644
index 0000000..b7ecd4a
--- /dev/null
+++ b/amd-builtins/math32/lgammaF.cl
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#define pi   3.1415927410e+00f        /* 0x40490fdb */
+
+#define a0   7.7215664089e-02f        /* 0x3d9e233f */
+#define a1   3.2246702909e-01f        /* 0x3ea51a66 */
+#define a2   6.7352302372e-02f        /* 0x3d89f001 */
+#define a3   2.0580807701e-02f        /* 0x3ca89915 */
+#define a4   7.3855509982e-03f        /* 0x3bf2027e */
+#define a5   2.8905137442e-03f        /* 0x3b3d6ec6 */
+#define a6   1.1927076848e-03f        /* 0x3a9c54a1 */
+#define a7   5.1006977446e-04f        /* 0x3a05b634 */
+#define a8   2.2086278477e-04f        /* 0x39679767 */
+#define a9   1.0801156895e-04f        /* 0x38e28445 */
+#define a10  2.5214456400e-05f        /* 0x37d383a2 */
+#define a11  4.4864096708e-05f        /* 0x383c2c75 */
+
+#define tc   1.4616321325e+00f        /* 0x3fbb16c3 */
+
+#define tf  -1.2148628384e-01f        /* 0xbdf8cdcd */
+/* tt -(tail of tf) */
+#define tt   6.6971006518e-09f        /* 0x31e61c52 */
+
+#define t0   4.8383611441e-01f        /* 0x3ef7b95e */
+#define t1  -1.4758771658e-01f        /* 0xbe17213c */
+#define t2   6.4624942839e-02f        /* 0x3d845a15 */
+#define t3  -3.2788541168e-02f        /* 0xbd064d47 */
+#define t4   1.7970675603e-02f        /* 0x3c93373d */
+#define t5  -1.0314224288e-02f        /* 0xbc28fcfe */
+#define t6   6.1005386524e-03f        /* 0x3bc7e707 */
+#define t7  -3.6845202558e-03f        /* 0xbb7177fe */
+#define t8   2.2596477065e-03f        /* 0x3b141699 */
+#define t9  -1.4034647029e-03f        /* 0xbab7f476 */
+#define t10  8.8108185446e-04f        /* 0x3a66f867 */
+#define t11 -5.3859531181e-04f        /* 0xba0d3085 */
+#define t12  3.1563205994e-04f        /* 0x39a57b6b */
+#define t13 -3.1275415677e-04f        /* 0xb9a3f927 */
+#define t14  3.3552918467e-04f        /* 0x39afe9f7 */
+
+#define u0  -7.7215664089e-02f        /* 0xbd9e233f */
+#define u1   6.3282704353e-01f        /* 0x3f2200f4 */
+#define u2   1.4549225569e+00f        /* 0x3fba3ae7 */
+#define u3   9.7771751881e-01f        /* 0x3f7a4bb2 */
+#define u4   2.2896373272e-01f        /* 0x3e6a7578 */
+#define u5   1.3381091878e-02f        /* 0x3c5b3c5e */
+
+#define v1   2.4559779167e+00f        /* 0x401d2ebe */
+#define v2   2.1284897327e+00f        /* 0x4008392d */
+#define v3   7.6928514242e-01f        /* 0x3f44efdf */
+#define v4   1.0422264785e-01f        /* 0x3dd572af */
+#define v5   3.2170924824e-03f        /* 0x3b52d5db */
+
+#define s0  -7.7215664089e-02f        /* 0xbd9e233f */
+#define s1   2.1498242021e-01f        /* 0x3e5c245a */
+#define s2   3.2577878237e-01f        /* 0x3ea6cc7a */
+#define s3   1.4635047317e-01f        /* 0x3e15dce6 */
+#define s4   2.6642270386e-02f        /* 0x3cda40e4 */
+#define s5   1.8402845599e-03f        /* 0x3af135b4 */
+#define s6   3.1947532989e-05f        /* 0x3805ff67 */
+
+#define r1   1.3920053244e+00f        /* 0x3fb22d3b */
+#define r2   7.2193557024e-01f        /* 0x3f38d0c5 */
+#define r3   1.7193385959e-01f        /* 0x3e300f6e */
+#define r4   1.8645919859e-02f        /* 0x3c98bf54 */
+#define r5   7.7794247773e-04f        /* 0x3a4beed6 */
+#define r6   7.3266842264e-06f        /* 0x36f5d7bd */
+
+#define w0   4.1893854737e-01f        /* 0x3ed67f1d */
+#define w1   8.3333335817e-02f        /* 0x3daaaaab */
+#define w2  -2.7777778450e-03f        /* 0xbb360b61 */
+#define w3   7.9365057172e-04f        /* 0x3a500cfd */
+#define w4  -5.9518753551e-04f        /* 0xba1c065c */
+#define w5   8.3633989561e-04f        /* 0x3a5b3dd2 */
+#define w6  -1.6309292987e-03f        /* 0xbad5c4e8 */
+
+__attribute__ ((overloadable, always_inline)) float
+lgamma_r(float x, int *signp)
+{
+    int hx = as_int(x);
+    int ix = hx & 0x7fffffff;
+    float absx = as_float(ix);
+
+    if (ix >= 0x7f800000) {
+        *signp = 1;
+        return x;
+    }
+
+    if (absx < 0x1.0p-70f) {
+        *signp = hx < 0 ? -1 : 1;
+        return -log(absx);
+    }
+
+    float r;
+
+    if (absx == 1.0f | absx == 2.0f)
+        r = 0.0f;
+
+    else if (absx < 2.0f) {
+        float y = 2.0f - absx;
+        int i = 0;
+
+        int c = absx < 0x1.bb4c30p+0f;
+        float yt = absx - tc;
+        y = c ? yt : y;
+        i = c ? 1 : i;
+
+        c = absx < 0x1.3b4c40p+0f;
+        yt = absx - 1.0f;
+        y = c ? yt : y;
+        i = c ? 2 : i;
+
+        r = -log(absx);
+        yt = 1.0f - absx;
+        c = absx <= 0x1.ccccccp-1f;
+        r = c ? r : 0.0f;
+        y = c ? yt : y;
+        i = c ? 0 : i;
+
+        c = absx < 0x1.769440p-1f;
+        yt = absx - (tc - 1.0f);
+        y = c ? yt : y;
+        i = c ? 1 : i;
+
+        c = absx < 0x1.da6610p-3f;
+        y = c ? absx : y;
+        i = c ? 2 : i;
+
+        float z, w, p1, p2, p3, p;
+        switch (i) {
+        case 0:
+            z = y * y;
+            p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
+            p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
+            p = mad(y, p1, p2);
+            r += mad(y, -0.5f, p);
+            break;
+        case 1:
+            z = y * y;
+            w = z * y;
+            p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
+            p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
+            p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
+            p = mad(z, p1, -mad(w, -mad(y, p3, p2), tt));
+            r += tf + p;
+            break;
+        case 2:
+            p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
+            p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), 1.0f);
+            r += mad(y, -0.5f, MATH_DIVIDE(p1, p2));
+            break;
+        }
+    } else if (absx < 8.0f) {
+        int i = (int)absx;
+        float y = absx - (float) i;
+        float p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
+        float q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), 1.0f);
+        r = mad(y, 0.5f, MATH_DIVIDE(p, q));
+
+        float y6 = y + 6.0f;
+        float y5 = y + 5.0f;
+        float y4 = y + 4.0f;
+        float y3 = y + 3.0f;
+        float y2 = y + 2.0f;
+
+        float z = 1.0f;
+        z *= i > 6 ? y6 : 1.0f;
+        z *= i > 5 ? y5 : 1.0f;
+        z *= i > 4 ? y4 : 1.0f;
+        z *= i > 3 ? y3 : 1.0f;
+        z *= i > 2 ? y2 : 1.0f;
+
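+        // The rational approximation above covers [2,3); for larger i the
+        // recurrence lgamma(x) = lgamma(x-1) + log(x-1) is folded back in
+        // as a single log of the product (y+2)*(y+3)*...*(y+i-1).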
+        r += log(z);
+    } else if (absx < 0x1.0p+58f) {
+        float z = 1.0f / absx;
+        float y = z * z;
+        float w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
+        r = mad(absx - 0.5f, log(absx) - 1.0f, w);
+    } else
+        // 2**58 <= x <= Inf
+        r = absx * (log(absx) - 1.0f);
+
+    int s = 1;
+
+    if (x < 0.0f) {
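+        // Negative arguments use the reflection formula
+        // Gamma(x)*Gamma(1-x) = pi/sin(pi*x), i.e.
+        // lgamma(x) = log(pi / |x*sinpi(x)|) - lgamma(|x|).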
+        float t = sinpi(x);
+        r = log(pi / fabs(t * x)) - r;
+        r = t == 0.0f ? as_float(PINFBITPATT_SP32) : r;
+        s = t < 0.0f ? -1 : s;
+    }
+
+    *signp = s;
+    return r;
+}
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__ ((overloadable, always_inline)) float
+lgamma_r(float x, __local int *signp)
+{
+    int s;
+    float l = lgamma_r(x, &s);
+    *signp = s;
+    return l;
+}
+
+__attribute__ ((overloadable, always_inline)) float
+lgamma_r(float x, __global int *signp)
+{
+    int s;
+    float l = lgamma_r(x, &s);
+    *signp = s;
+    return l;
+}
+#endif
+
+__attribute__ ((overloadable, always_inline)) float
+lgamma(float x)
+{
+    int s;
+    float l = lgamma_r(x, &s);
+    return l;
+}
+
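+/* Illustrative usage sketch (not part of the library): the sign of
+ * Gamma(x) comes back through signp, e.g.
+ *
+ *     int s;
+ *     float l = lgamma_r(x, &s);
+ *     // s == -1 iff Gamma(x) < 0; lgamma(x) above simply discards s.
+ */
+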
diff --git a/amd-builtins/math32/log10F.cl b/amd-builtins/math32/log10F.cl
new file mode 100644
index 0000000..b418fea
--- /dev/null
+++ b/amd-builtins/math32/log10F.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_LOG10
+#include "logF_base.h"
+
diff --git a/amd-builtins/math32/log1pF.cl b/amd-builtins/math32/log1pF.cl
new file mode 100644
index 0000000..eb2b059
--- /dev/null
+++ b/amd-builtins/math32/log1pF.cl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+log1p(float x)
+{
+    USE_TABLE(float2, p_log, LOGE_TBL);
+    USE_TABLE(float, p_inv, LOG_INV_TBL);
+
+    float w = x;
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+
+    // |x| < 2^-4
+    float u2 = MATH_DIVIDE(x, 2.0f + x);
+    float u = u2 + u2;
+    float v = u * u;
+    // 2/(5 * 2^5), 2/(3 * 2^3)
+    float zsmall = mad(-u2, x, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v * u) + x;
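+    // This is the series log1p(x) = 2*atanh(x/(2+x)) = u + u^3/12 + u^5/80 + ...
+    // with u = 2*x/(2+x); mad(-u2, x, ...) + x reconstructs u as x - u2*x.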
+
+    // |x| >= 2^-4: compute log(x + 1.0f) directly
+    ux = as_uint(x + 1.0f);
+
+    int m = (int)((ux >> EXPSHIFTBITS_SP32) & 0xff) - EXPBIAS_SP32;
+    float mf = (float)m;
+    uint indx = (ux & 0x007f0000) + ((ux & 0x00008000) << 1);
+    float F = as_float(indx | 0x3f000000);
+
+    // x > 2^24
+    float fg24 = F - as_float(0x3f000000 | (ux & MANTBITS_SP32));
+
+    // x <= 2^24
+    uint xhi = ux & 0xffff8000;
+    float xh = as_float(xhi);
+    float xt = (1.0f - xh) + w;
+    uint xnm = ((~(xhi & 0x7f800000)) - 0x00800000) & 0x7f800000;
+    xt = xt * as_float(xnm) * 0.5f;
+    float fl24 = F - as_float(0x3f000000 | (xhi & MANTBITS_SP32)) - xt;
+
+    float f = mf > 24.0f ? fg24 : fl24;
+
+    indx = indx >> 16;
+    float r = f * p_inv[indx];
+
+    // 1/3, 1/2
+    float poly = mad(mad(r, 0x1.555556p-2f, 0x1.0p-1f), r*r, r);
+
+    const float LOG2_HEAD = 0x1.62e000p-1f;   // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f;  // 0.0000319461833
+
+    float2 tv = p_log[indx];
+    float z1 = mad(mf, LOG2_HEAD, tv.s0);
+    float z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
+    float z = z1 + z2;
+
+    z = ax < 0x3d800000U ? zsmall : z;
+
+    // Edge cases
+    z = ax >= PINFBITPATT_SP32 ? w : z;
+    z = w  < -1.0f ? as_float(QNANBITPATT_SP32) : z;
+    z = w == -1.0f ? as_float(NINFBITPATT_SP32) : z;
+    // Fix subnormals
+    z = ax < 0x33800000 ? x : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/log2F.cl b/amd-builtins/math32/log2F.cl
new file mode 100644
index 0000000..a90e149
--- /dev/null
+++ b/amd-builtins/math32/log2F.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_LOG2
+#include "logF_base.h"
+
diff --git a/amd-builtins/math32/logF.cl b/amd-builtins/math32/logF.cl
new file mode 100644
index 0000000..79fb03e
--- /dev/null
+++ b/amd-builtins/math32/logF.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_LOG
+#include "logF_base.h"
+
diff --git a/amd-builtins/math32/logF_base.h b/amd-builtins/math32/logF_base.h
new file mode 100644
index 0000000..9482247
--- /dev/null
+++ b/amd-builtins/math32/logF_base.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+/*
+   Algorithm:
+
+   Based on:
+   Ping-Tak Peter Tang
+   "Table-driven implementation of the logarithm function in IEEE
+   floating-point arithmetic"
+   ACM Transactions on Mathematical Software (TOMS)
+   Volume 16, Issue 4 (December 1990)
+
+
+   x very close to 1.0 is handled differently; for all other x
+   a brief explanation is given below.
+
+   x = (2^m)*A
+   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+   x = (2^m)*2*(G/2+g/2)
+   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+
+   Y = (2^(-1))*(2^(-m))*(2^m)*A
+   Now, range of Y is: 0.5 <= Y < 1
+
+   F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
+   Now, range of F is: 128 <= F <= 256 
+   F = F / 256 
+   Now, range of F is: 0.5 <= F <= 1
+
+   f = -(Y-F), with (f <= 2^(-9))
+
+   log(x) = m*log(2) + log(2) + log(F-f)
+   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
+   log(x) = m*log(2) + log(2*F) + log(1-r)
+
+   r = (f/F), with (r <= 2^(-8))
+   r = f*(1/F) with (1/F) precomputed to avoid division
+
+   log(x) = m*log(2) + log(G) - poly
+
+   log(G) is precomputed
+   poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5
+
+   log(2) and log(G) need to be maintained in extra precision
+   to avoid losing precision in the calculations
+
+
+   For x close to 1.0, we employ the following technique to
+   ensure faster convergence.
+
+   log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7 + ...
+   x = ((1+s)/(1-s)) 
+   x = 1 + r
+   s = r/(2+r)
+
+*/
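+
+/* Worked example (illustrative): x = 6.0f gives m = 2 and Y = 0.75, so
+   F = 0.75, f = 0 and r = 0; the result reduces to
+   log(6) = 2*log(2) + log(2*0.75) = 2*log(2) + log(1.5). */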
+
+__attribute__((overloadable, weak)) float
+#if defined(COMPILING_LOG2)
+log2(float x)
+#elif defined(COMPILING_LOG10)
+log10(float x)
+#else
+log(float x)
+#endif
+{
+    USE_TABLE(float, p_inv, LOG_INV_TBL);
+
+#if defined(COMPILING_LOG2)
+    USE_TABLE(float2, p_log, LOG2_TBL);
+    const float LOG2E = 0x1.715476p+0f;      // 1.4426950408889634
+    const float LOG2E_HEAD = 0x1.700000p+0f; // 1.4375
+    const float LOG2E_TAIL = 0x1.547652p-8f; // 0.00519504072
+#elif defined(COMPILING_LOG10)
+    USE_TABLE(float2, p_log, LOG10_TBL);
+    const float LOG10E = 0x1.bcb7b2p-2f;        // 0.43429448190325182
+    const float LOG10E_HEAD = 0x1.bc0000p-2f;   // 0.43359375
+    const float LOG10E_TAIL = 0x1.6f62a4p-11f;  // 0.0007007319
+    const float LOG10_2_HEAD = 0x1.340000p-2f;  // 0.30078125
+    const float LOG10_2_TAIL = 0x1.04d426p-12f; // 0.000248745637
+#else
+    USE_TABLE(float2, p_log, LOGE_TBL);
+    const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+#endif
+
+    uint xi = as_uint(x);
+    uint ax = xi & EXSIGNBIT_SP32;
+
+    // Calculations for |x-1| < 2^-4
+    float r = x - 1.0f;
+    int near1 = fabs(r) < 0x1.0p-4f;
+    float u2 = MATH_DIVIDE(r, 2.0f + r);
+    float corr = u2 * r;
+    float u = u2 + u2;
+    float v = u * u;
+    float znear1, z1, z2;
+
+    // 2/(5 * 2^5), 2/(3 * 2^3)
+    z2 = mad(u, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f)*v, -corr);
+
+#if defined(COMPILING_LOG2)
+    z1 = as_float(as_int(r) & 0xffff0000);
+    z2 = z2 + (r - z1);
+    znear1 = mad(z1, LOG2E_HEAD, mad(z2, LOG2E_HEAD, mad(z1, LOG2E_TAIL, z2*LOG2E_TAIL)));
+#elif defined(COMPILING_LOG10)
+    z1 = as_float(as_int(r) & 0xffff0000);
+    z2 = z2 + (r - z1);
+    znear1 = mad(z1, LOG10E_HEAD, mad(z2, LOG10E_HEAD, mad(z1, LOG10E_TAIL, z2*LOG10E_TAIL)));
+#else
+    znear1 = z2 + r;
+#endif
+
+    // Calculations for x not near 1
+    int m = (int)(xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+    // Normalize subnormal
+    uint xis = as_uint(as_float(xi | 0x3f800000) - 1.0f);
+    int ms = (int)(xis >> EXPSHIFTBITS_SP32) - 253;
+    int c = m == -127;
+    m = c ? ms : m;
+    uint xin = c ? xis : xi;
+
+    float mf = (float)m;
+    uint indx = (xin & 0x007f0000) + ((xin & 0x00008000) << 1);
+
+    // F - Y
+    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (xin & MANTBITS_SP32));
+
+    indx = indx >> 16;
+    r = f * p_inv[indx];
+
+    // 1/3,  1/2
+    float poly = mad(mad(r, 0x1.555556p-2f, 0.5f), r*r, r);
+
+    float2 tv = p_log[indx];
+
+#if defined(COMPILING_LOG2)
+    z1 = tv.s0 + mf;
+    z2 = mad(poly, -LOG2E, tv.s1);
+#elif defined(COMPILING_LOG10)
+    z1 = mad(mf, LOG10_2_HEAD, tv.s0);
+    z2 = mad(poly, -LOG10E, mf*LOG10_2_TAIL) + tv.s1;
+#else
+    z1 = mad(mf, LOG2_HEAD, tv.s0);
+    z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
+#endif
+
+    float z = z1 + z2;
+    z = near1 ? znear1 : z;
+
+    // Corner cases
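+    // log(+/-0) = -inf, log(x < 0) = NaN, log(+inf) = +inf; NaN inputs propagate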
+    z = ax >= PINFBITPATT_SP32 ? x : z;
+    z = xi != ax ? as_float(QNANBITPATT_SP32) : z;
+    z = ax == 0 ? as_float(NINFBITPATT_SP32) : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/logF_table.h b/amd-builtins/math32/logF_table.h
new file mode 100644
index 0000000..4b52129
--- /dev/null
+++ b/amd-builtins/math32/logF_table.h
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(float2, LOG2_TBL, 129,
+    (float2)(0x0.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.6f8000p-7f, 0x1.942dbap-17f),
+    (float2)(0x1.6e0000p-6f, 0x1.e5a170p-16f),
+    (float2)(0x1.118000p-5f, 0x1.347544p-15f),
+    (float2)(0x1.6b8000p-5f, 0x1.69bac6p-16f),
+    (float2)(0x1.c48000p-5f, 0x1.7eae42p-15f),
+    (float2)(0x1.0e8000p-4f, 0x1.9c4fd0p-15f),
+    (float2)(0x1.3a8000p-4f, 0x1.17ee92p-15f),
+    (float2)(0x1.660000p-4f, 0x1.fb7d64p-15f),
+    (float2)(0x1.918000p-4f, 0x1.42dc8cp-17f),
+    (float2)(0x1.bc8000p-4f, 0x1.0902b6p-18f),
+    (float2)(0x1.e70000p-4f, 0x1.7608bep-15f),
+    (float2)(0x1.088000p-3f, 0x1.162336p-13f),
+    (float2)(0x1.1d8000p-3f, 0x1.3465d4p-13f),
+    (float2)(0x1.328000p-3f, 0x1.74f13cp-14f),
+    (float2)(0x1.470000p-3f, 0x1.aa7e60p-13f),
+    (float2)(0x1.5c0000p-3f, 0x1.a39fbcp-19f),
+    (float2)(0x1.700000p-3f, 0x1.d0b53ap-13f),
+    (float2)(0x1.848000p-3f, 0x1.0af40ap-13f),
+    (float2)(0x1.988000p-3f, 0x1.b741dep-13f),
+    (float2)(0x1.ac8000p-3f, 0x1.d78b6cp-13f),
+    (float2)(0x1.c08000p-3f, 0x1.6db376p-13f),
+    (float2)(0x1.d48000p-3f, 0x1.ee4c32p-15f),
+    (float2)(0x1.e80000p-3f, 0x1.02f9d2p-13f),
+    (float2)(0x1.fb8000p-3f, 0x1.05ae40p-13f),
+    (float2)(0x1.078000p-2f, 0x1.0adbb0p-14f),
+    (float2)(0x1.110000p-2f, 0x1.83ed68p-13f),
+    (float2)(0x1.1a8000p-2f, 0x1.016ca4p-12f),
+    (float2)(0x1.240000p-2f, 0x1.01eac2p-12f),
+    (float2)(0x1.2d8000p-2f, 0x1.887e26p-13f),
+    (float2)(0x1.370000p-2f, 0x1.24cea4p-14f),
+    (float2)(0x1.400000p-2f, 0x1.918ec6p-12f),
+    (float2)(0x1.498000p-2f, 0x1.3c25e6p-13f),
+    (float2)(0x1.528000p-2f, 0x1.6f7f12p-12f),
+    (float2)(0x1.5c0000p-2f, 0x1.a39fbcp-18f),
+    (float2)(0x1.650000p-2f, 0x1.8fe466p-14f),
+    (float2)(0x1.6e0000p-2f, 0x1.10e6cep-13f),
+    (float2)(0x1.770000p-2f, 0x1.d2ba7ep-14f),
+    (float2)(0x1.800000p-2f, 0x1.4ac62cp-15f),
+    (float2)(0x1.888000p-2f, 0x1.a71cb8p-12f),
+    (float2)(0x1.918000p-2f, 0x1.dd448ep-13f),
+    (float2)(0x1.9a8000p-2f, 0x1.1c8f10p-21f),
+    (float2)(0x1.a30000p-2f, 0x1.bb053ep-13f),
+    (float2)(0x1.ab8000p-2f, 0x1.861e5ep-12f),
+    (float2)(0x1.b40000p-2f, 0x1.fafdcep-12f),
+    (float2)(0x1.bd0000p-2f, 0x1.e5d3cep-15f),
+    (float2)(0x1.c58000p-2f, 0x1.2fad28p-14f),
+    (float2)(0x1.ce0000p-2f, 0x1.492474p-15f),
+    (float2)(0x1.d60000p-2f, 0x1.d4f80cp-12f),
+    (float2)(0x1.de8000p-2f, 0x1.4ff510p-12f),
+    (float2)(0x1.e70000p-2f, 0x1.3550f2p-13f),
+    (float2)(0x1.ef0000p-2f, 0x1.b59ccap-12f),
+    (float2)(0x1.f78000p-2f, 0x1.42b464p-13f),
+    (float2)(0x1.ff8000p-2f, 0x1.5e66a0p-12f),
+    (float2)(0x1.038000p-1f, 0x1.f6a2e4p-11f),
+    (float2)(0x1.080000p-1f, 0x1.39e4fep-14f),
+    (float2)(0x1.0c0000p-1f, 0x1.0500d6p-13f),
+    (float2)(0x1.100000p-1f, 0x1.13b152p-13f),
+    (float2)(0x1.140000p-1f, 0x1.93f542p-14f),
+    (float2)(0x1.180000p-1f, 0x1.467b94p-16f),
+    (float2)(0x1.1b8000p-1f, 0x1.cc47a4p-11f),
+    (float2)(0x1.1f8000p-1f, 0x1.78f4c2p-11f),
+    (float2)(0x1.238000p-1f, 0x1.107508p-11f),
+    (float2)(0x1.278000p-1f, 0x1.2602c2p-12f),
+    (float2)(0x1.2b8000p-1f, 0x1.a39fbcp-20f),
+    (float2)(0x1.2f0000p-1f, 0x1.5a1d7ap-11f),
+    (float2)(0x1.330000p-1f, 0x1.3e355ap-12f),
+    (float2)(0x1.368000p-1f, 0x1.cffedap-11f),
+    (float2)(0x1.3a8000p-1f, 0x1.d9fd50p-12f),
+    (float2)(0x1.3e0000p-1f, 0x1.f64de6p-11f),
+    (float2)(0x1.420000p-1f, 0x1.d83f4cp-12f),
+    (float2)(0x1.458000p-1f, 0x1.cea628p-11f),
+    (float2)(0x1.498000p-1f, 0x1.3c25e6p-12f),
+    (float2)(0x1.4d0000p-1f, 0x1.5a96ccp-11f),
+    (float2)(0x1.510000p-1f, 0x1.18708ap-17f),
+    (float2)(0x1.548000p-1f, 0x1.374652p-12f),
+    (float2)(0x1.580000p-1f, 0x1.2089a6p-11f),
+    (float2)(0x1.5b8000p-1f, 0x1.93432cp-11f),
+    (float2)(0x1.5f0000p-1f, 0x1.f3fd06p-11f),
+    (float2)(0x1.630000p-1f, 0x1.0b8f54p-13f),
+    (float2)(0x1.668000p-1f, 0x1.004722p-12f),
+    (float2)(0x1.6a0000p-1f, 0x1.57cf2cp-12f),
+    (float2)(0x1.6d8000p-1f, 0x1.8cb53ap-12f),
+    (float2)(0x1.710000p-1f, 0x1.9f4d8ap-12f),
+    (float2)(0x1.748000p-1f, 0x1.8feb26p-12f),
+    (float2)(0x1.780000p-1f, 0x1.5edfeep-12f),
+    (float2)(0x1.7b8000p-1f, 0x1.0c7c9ap-12f),
+    (float2)(0x1.7f0000p-1f, 0x1.322182p-13f),
+    (float2)(0x1.828000p-1f, 0x1.3ab7cep-18f),
+    (float2)(0x1.858000p-1f, 0x1.a82c2cp-11f),
+    (float2)(0x1.890000p-1f, 0x1.3dd2c0p-11f),
+    (float2)(0x1.8c8000p-1f, 0x1.871da4p-12f),
+    (float2)(0x1.900000p-1f, 0x1.cc2c00p-14f),
+    (float2)(0x1.930000p-1f, 0x1.9fdb68p-11f),
+    (float2)(0x1.968000p-1f, 0x1.ed6956p-12f),
+    (float2)(0x1.9a0000p-1f, 0x1.f1a760p-14f),
+    (float2)(0x1.9d0000p-1f, 0x1.767f54p-11f),
+    (float2)(0x1.a08000p-1f, 0x1.3f6d26p-12f),
+    (float2)(0x1.a38000p-1f, 0x1.b9fce2p-11f),
+    (float2)(0x1.a70000p-1f, 0x1.8ae816p-12f),
+    (float2)(0x1.aa0000p-1f, 0x1.c23d60p-11f),
+    (float2)(0x1.ad8000p-1f, 0x1.60f388p-12f),
+    (float2)(0x1.b08000p-1f, 0x1.9049aep-11f),
+    (float2)(0x1.b40000p-1f, 0x1.8734a8p-13f),
+    (float2)(0x1.b70000p-1f, 0x1.2523d4p-11f),
+    (float2)(0x1.ba0000p-1f, 0x1.da6ce6p-11f),
+    (float2)(0x1.bd8000p-1f, 0x1.038e62p-12f),
+    (float2)(0x1.c08000p-1f, 0x1.1b511ep-11f),
+    (float2)(0x1.c38000p-1f, 0x1.a728b8p-11f),
+    (float2)(0x1.c70000p-1f, 0x1.2b5d22p-14f),
+    (float2)(0x1.ca0000p-1f, 0x1.2c6e54p-12f),
+    (float2)(0x1.cd0000p-1f, 0x1.f35064p-12f),
+    (float2)(0x1.d00000p-1f, 0x1.4fdb48p-11f),
+    (float2)(0x1.d30000p-1f, 0x1.98ec9ep-11f),
+    (float2)(0x1.d60000p-1f, 0x1.d4f80cp-11f),
+    (float2)(0x1.d98000p-1f, 0x1.0643d6p-17f),
+    (float2)(0x1.dc8000p-1f, 0x1.33567ep-14f),
+    (float2)(0x1.df8000p-1f, 0x1.e0410cp-14f),
+    (float2)(0x1.e28000p-1f, 0x1.142e0ep-13f),
+    (float2)(0x1.e58000p-1f, 0x1.063c88p-13f),
+    (float2)(0x1.e88000p-1f, 0x1.8d66c4p-14f),
+    (float2)(0x1.eb8000p-1f, 0x1.57e32ap-15f),
+    (float2)(0x1.ee0000p-1f, 0x1.ed1c6cp-11f),
+    (float2)(0x1.f10000p-1f, 0x1.b8a076p-11f),
+    (float2)(0x1.f40000p-1f, 0x1.7822f2p-11f),
+    (float2)(0x1.f70000p-1f, 0x1.2bbc3ap-11f),
+    (float2)(0x1.fa0000p-1f, 0x1.a708bap-12f),
+    (float2)(0x1.fd0000p-1f, 0x1.be4c7ep-13f),
+    (float2)(0x1.000000p+0f, 0x0.000000p+0f),
+)
+
+DECLARE_TABLE(float2, LOG10_TBL, 129,
+    (float2)(0x0.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.ba8000p-9f, 0x1.f51c88p-19f),
+    (float2)(0x1.b90000p-8f, 0x1.1da93ep-18f),
+    (float2)(0x1.498000p-7f, 0x1.8428a2p-18f),
+    (float2)(0x1.b58000p-7f, 0x1.a423acp-17f),
+    (float2)(0x1.108000p-6f, 0x1.41d422p-17f),
+    (float2)(0x1.458000p-6f, 0x1.d3d6b2p-16f),
+    (float2)(0x1.7a8000p-6f, 0x1.70f7cep-16f),
+    (float2)(0x1.af0000p-6f, 0x1.7e4ac0p-16f),
+    (float2)(0x1.e38000p-6f, 0x1.ab2f40p-24f),
+    (float2)(0x1.0b8000p-5f, 0x1.00d40ap-16f),
+    (float2)(0x1.250000p-5f, 0x1.40b03ep-15f),
+    (float2)(0x1.3e8000p-5f, 0x1.446668p-15f),
+    (float2)(0x1.580000p-5f, 0x1.1c7758p-16f),
+    (float2)(0x1.710000p-5f, 0x1.20d09ep-15f),
+    (float2)(0x1.8a0000p-5f, 0x1.fd6f5cp-16f),
+    (float2)(0x1.a30000p-5f, 0x1.53ac12p-18f),
+    (float2)(0x1.bb8000p-5f, 0x1.4d02c6p-16f),
+    (float2)(0x1.d40000p-5f, 0x1.d5164ep-17f),
+    (float2)(0x1.ec0000p-5f, 0x1.991facp-15f),
+    (float2)(0x1.020000p-4f, 0x1.0a307cp-14f),
+    (float2)(0x1.0e0000p-4f, 0x1.e94ec0p-15f),
+    (float2)(0x1.1a0000p-4f, 0x1.1a22a8p-15f),
+    (float2)(0x1.258000p-4f, 0x1.d4857ap-14f),
+    (float2)(0x1.318000p-4f, 0x1.982ae2p-15f),
+    (float2)(0x1.3d0000p-4f, 0x1.74cd70p-14f),
+    (float2)(0x1.488000p-4f, 0x1.cfb476p-14f),
+    (float2)(0x1.540000p-4f, 0x1.ddcc64p-14f),
+    (float2)(0x1.5f8000p-4f, 0x1.a01222p-14f),
+    (float2)(0x1.6b0000p-4f, 0x1.177dbcp-14f),
+    (float2)(0x1.768000p-4f, 0x1.140a24p-16f),
+    (float2)(0x1.818000p-4f, 0x1.298f40p-14f),
+    (float2)(0x1.8c8000p-4f, 0x1.c60e20p-14f),
+    (float2)(0x1.980000p-4f, 0x1.b65052p-18f),
+    (float2)(0x1.a30000p-4f, 0x1.53ac12p-17f),
+    (float2)(0x1.ad8000p-4f, 0x1.f41d04p-14f),
+    (float2)(0x1.b88000p-4f, 0x1.7934eap-14f),
+    (float2)(0x1.c38000p-4f, 0x1.75252ep-15f),
+    (float2)(0x1.ce0000p-4f, 0x1.b90790p-14f),
+    (float2)(0x1.d90000p-4f, 0x1.d5866ap-16f),
+    (float2)(0x1.e38000p-4f, 0x1.e0d586p-15f),
+    (float2)(0x1.ee0000p-4f, 0x1.2ae984p-14f),
+    (float2)(0x1.f88000p-4f, 0x1.25a0d0p-14f),
+    (float2)(0x1.018000p-3f, 0x1.c2a064p-15f),
+    (float2)(0x1.068000p-3f, 0x1.2f59e8p-13f),
+    (float2)(0x1.0b8000p-3f, 0x1.cf424cp-13f),
+    (float2)(0x1.110000p-3f, 0x1.42f080p-15f),
+    (float2)(0x1.160000p-3f, 0x1.684156p-14f),
+    (float2)(0x1.1b0000p-3f, 0x1.f38f64p-14f),
+    (float2)(0x1.200000p-3f, 0x1.22077ap-13f),
+    (float2)(0x1.250000p-3f, 0x1.2d34d6p-13f),
+    (float2)(0x1.2a0000p-3f, 0x1.1ba328p-13f),
+    (float2)(0x1.2f0000p-3f, 0x1.db48e2p-14f),
+    (float2)(0x1.340000p-3f, 0x1.4712a0p-14f),
+    (float2)(0x1.390000p-3f, 0x1.ed0894p-16f),
+    (float2)(0x1.3d8000p-3f, 0x1.bc39b6p-13f),
+    (float2)(0x1.428000p-3f, 0x1.1f9ff8p-13f),
+    (float2)(0x1.478000p-3f, 0x1.a07d3ap-15f),
+    (float2)(0x1.4c0000p-3f, 0x1.9601fap-13f),
+    (float2)(0x1.510000p-3f, 0x1.532214p-14f),
+    (float2)(0x1.558000p-3f, 0x1.a31462p-13f),
+    (float2)(0x1.5a8000p-3f, 0x1.05a584p-14f),
+    (float2)(0x1.5f0000p-3f, 0x1.4911c8p-13f),
+    (float2)(0x1.638000p-3f, 0x1.f615fep-13f),
+    (float2)(0x1.688000p-3f, 0x1.1445b0p-14f),
+    (float2)(0x1.6d0000p-3f, 0x1.057abcp-13f),
+    (float2)(0x1.718000p-3f, 0x1.685f0ap-13f),
+    (float2)(0x1.760000p-3f, 0x1.b31022p-13f),
+    (float2)(0x1.7a8000p-3f, 0x1.e5cd62p-13f),
+    (float2)(0x1.7f8000p-3f, 0x1.aa6ca8p-22f),
+    (float2)(0x1.840000p-3f, 0x1.1944bcp-19f),
+    (float2)(0x1.880000p-3f, 0x1.f0b980p-13f),
+    (float2)(0x1.8c8000p-3f, 0x1.c60e20p-13f),
+    (float2)(0x1.910000p-3f, 0x1.849daep-13f),
+    (float2)(0x1.958000p-3f, 0x1.2ca202p-13f),
+    (float2)(0x1.9a0000p-3f, 0x1.7ca842p-14f),
+    (float2)(0x1.9e8000p-3f, 0x1.cf6180p-16f),
+    (float2)(0x1.a28000p-3f, 0x1.9fa186p-13f),
+    (float2)(0x1.a70000p-3f, 0x1.df5554p-14f),
+    (float2)(0x1.ab8000p-3f, 0x1.51eaccp-16f),
+    (float2)(0x1.af8000p-3f, 0x1.4f8e88p-13f),
+    (float2)(0x1.b40000p-3f, 0x1.7f49aap-15f),
+    (float2)(0x1.b80000p-3f, 0x1.5b3c72p-13f),
+    (float2)(0x1.bc8000p-3f, 0x1.07fd5cp-15f),
+    (float2)(0x1.c08000p-3f, 0x1.144d18p-13f),
+    (float2)(0x1.c48000p-3f, 0x1.d25700p-13f),
+    (float2)(0x1.c90000p-3f, 0x1.f1369ep-15f),
+    (float2)(0x1.cd0000p-3f, 0x1.1260fap-13f),
+    (float2)(0x1.d10000p-3f, 0x1.94c038p-13f),
+    (float2)(0x1.d58000p-3f, 0x1.ccfdb8p-20f),
+    (float2)(0x1.d98000p-3f, 0x1.7c70dap-15f),
+    (float2)(0x1.dd8000p-3f, 0x1.4ee87ap-14f),
+    (float2)(0x1.e18000p-3f, 0x1.b99d86p-14f),
+    (float2)(0x1.e58000p-3f, 0x1.feafc0p-14f),
+    (float2)(0x1.e98000p-3f, 0x1.0f3b16p-13f),
+    (float2)(0x1.ed8000p-3f, 0x1.0ca34cp-13f),
+    (float2)(0x1.f18000p-3f, 0x1.ef75b2p-14f),
+    (float2)(0x1.f58000p-3f, 0x1.a15704p-14f),
+    (float2)(0x1.f98000p-3f, 0x1.2f3cfap-14f),
+    (float2)(0x1.fd8000p-3f, 0x1.32f1dcp-15f),
+    (float2)(0x1.008000p-2f, 0x1.f02d90p-13f),
+    (float2)(0x1.028000p-2f, 0x1.821964p-13f),
+    (float2)(0x1.048000p-2f, 0x1.02a708p-13f),
+    (float2)(0x1.068000p-2f, 0x1.c7f450p-15f),
+    (float2)(0x1.080000p-2f, 0x1.e820cap-12f),
+    (float2)(0x1.0a0000p-2f, 0x1.8ecd14p-12f),
+    (float2)(0x1.0c0000p-2f, 0x1.2d15f4p-12f),
+    (float2)(0x1.0e0000p-2f, 0x1.861b72p-13f),
+    (float2)(0x1.100000p-2f, 0x1.4319e6p-14f),
+    (float2)(0x1.118000p-2f, 0x1.d6520ep-12f),
+    (float2)(0x1.138000p-2f, 0x1.53c218p-12f),
+    (float2)(0x1.158000p-2f, 0x1.925000p-13f),
+    (float2)(0x1.178000p-2f, 0x1.b4a7a2p-15f),
+    (float2)(0x1.190000p-2f, 0x1.9c19eep-12f),
+    (float2)(0x1.1b0000p-2f, 0x1.f38f64p-13f),
+    (float2)(0x1.1d0000p-2f, 0x1.3ebb32p-14f),
+    (float2)(0x1.1e8000p-2f, 0x1.9ddf96p-12f),
+    (float2)(0x1.208000p-2f, 0x1.c8d472p-13f),
+    (float2)(0x1.228000p-2f, 0x1.1af536p-15f),
+    (float2)(0x1.240000p-2f, 0x1.5acca0p-12f),
+    (float2)(0x1.260000p-2f, 0x1.158770p-13f),
+    (float2)(0x1.278000p-2f, 0x1.b35350p-12f),
+    (float2)(0x1.298000p-2f, 0x1.a91532p-13f),
+    (float2)(0x1.2b0000p-2f, 0x1.ee7896p-12f),
+    (float2)(0x1.2d0000p-2f, 0x1.012c1cp-12f),
+    (float2)(0x1.2f0000p-2f, 0x1.967ab4p-17f),
+    (float2)(0x1.308000p-2f, 0x1.111e3cp-12f),
+    (float2)(0x1.328000p-2f, 0x1.cf340ep-17f),
+    (float2)(0x1.340000p-2f, 0x1.04d426p-12f),
+)
+
+DECLARE_TABLE(float2, LOGE_TBL, 129,
+    (float2)(0x0.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.fe0000p-8f, 0x1.535882p-23f),
+    (float2)(0x1.fc0000p-7f, 0x1.5161f8p-20f),
+    (float2)(0x1.7b8000p-6f, 0x1.1b07d4p-18f),
+    (float2)(0x1.f82000p-6f, 0x1.361cf0p-19f),
+    (float2)(0x1.39e000p-5f, 0x1.0f73fcp-18f),
+    (float2)(0x1.774000p-5f, 0x1.63d8cap-19f),
+    (float2)(0x1.b42000p-5f, 0x1.bae232p-18f),
+    (float2)(0x1.f0a000p-5f, 0x1.86008ap-20f),
+    (float2)(0x1.164000p-4f, 0x1.36eea2p-16f),
+    (float2)(0x1.340000p-4f, 0x1.d7961ap-16f),
+    (float2)(0x1.51a000p-4f, 0x1.073f06p-16f),
+    (float2)(0x1.6f0000p-4f, 0x1.a515cap-17f),
+    (float2)(0x1.8c2000p-4f, 0x1.45d630p-16f),
+    (float2)(0x1.a92000p-4f, 0x1.b4e92ap-18f),
+    (float2)(0x1.c5e000p-4f, 0x1.523d6ep-18f),
+    (float2)(0x1.e26000p-4f, 0x1.076e2ap-16f),
+    (float2)(0x1.fec000p-4f, 0x1.2263b6p-17f),
+    (float2)(0x1.0d6000p-3f, 0x1.7e7cd0p-15f),
+    (float2)(0x1.1b6000p-3f, 0x1.2ad52ep-15f),
+    (float2)(0x1.294000p-3f, 0x1.52f81ep-15f),
+    (float2)(0x1.370000p-3f, 0x1.fc201ep-15f),
+    (float2)(0x1.44c000p-3f, 0x1.2b6ccap-15f),
+    (float2)(0x1.526000p-3f, 0x1.cbc742p-16f),
+    (float2)(0x1.5fe000p-3f, 0x1.3070a6p-15f),
+    (float2)(0x1.6d6000p-3f, 0x1.fce33ap-20f),
+    (float2)(0x1.7aa000p-3f, 0x1.890210p-15f),
+    (float2)(0x1.87e000p-3f, 0x1.a06520p-15f),
+    (float2)(0x1.952000p-3f, 0x1.6a73d0p-17f),
+    (float2)(0x1.a22000p-3f, 0x1.bc1fe2p-15f),
+    (float2)(0x1.af2000p-3f, 0x1.c94e80p-15f),
+    (float2)(0x1.bc2000p-3f, 0x1.0ce85ap-16f),
+    (float2)(0x1.c8e000p-3f, 0x1.f7c79ap-15f),
+    (float2)(0x1.d5c000p-3f, 0x1.0b5a7cp-18f),
+    (float2)(0x1.e26000p-3f, 0x1.076e2ap-15f),
+    (float2)(0x1.ef0000p-3f, 0x1.5b97b8p-16f),
+    (float2)(0x1.fb8000p-3f, 0x1.186d5ep-15f),
+    (float2)(0x1.040000p-2f, 0x1.2ca5a6p-17f),
+    (float2)(0x1.0a2000p-2f, 0x1.24e272p-14f),
+    (float2)(0x1.104000p-2f, 0x1.8bf9aep-14f),
+    (float2)(0x1.166000p-2f, 0x1.5cabaap-14f),
+    (float2)(0x1.1c8000p-2f, 0x1.3182d2p-15f),
+    (float2)(0x1.228000p-2f, 0x1.41fbcep-14f),
+    (float2)(0x1.288000p-2f, 0x1.5a13dep-14f),
+    (float2)(0x1.2e8000p-2f, 0x1.c575c2p-15f),
+    (float2)(0x1.346000p-2f, 0x1.dd9a98p-14f),
+    (float2)(0x1.3a6000p-2f, 0x1.3155a4p-16f),
+    (float2)(0x1.404000p-2f, 0x1.843434p-17f),
+    (float2)(0x1.460000p-2f, 0x1.8bc21cp-14f),
+    (float2)(0x1.4be000p-2f, 0x1.7e55dcp-16f),
+    (float2)(0x1.51a000p-2f, 0x1.5b0e5ap-15f),
+    (float2)(0x1.576000p-2f, 0x1.dc5d14p-16f),
+    (float2)(0x1.5d0000p-2f, 0x1.bdbf58p-14f),
+    (float2)(0x1.62c000p-2f, 0x1.05e572p-15f),
+    (float2)(0x1.686000p-2f, 0x1.903d36p-15f),
+    (float2)(0x1.6e0000p-2f, 0x1.1d5456p-15f),
+    (float2)(0x1.738000p-2f, 0x1.d7f6bap-14f),
+    (float2)(0x1.792000p-2f, 0x1.4abfbap-15f),
+    (float2)(0x1.7ea000p-2f, 0x1.f07704p-15f),
+    (float2)(0x1.842000p-2f, 0x1.a3b43cp-15f),
+    (float2)(0x1.89a000p-2f, 0x1.9c360ap-17f),
+    (float2)(0x1.8f0000p-2f, 0x1.1e8736p-14f),
+    (float2)(0x1.946000p-2f, 0x1.941c20p-14f),
+    (float2)(0x1.99c000p-2f, 0x1.958116p-14f),
+    (float2)(0x1.9f2000p-2f, 0x1.23ecbep-14f),
+    (float2)(0x1.a48000p-2f, 0x1.024396p-16f),
+    (float2)(0x1.a9c000p-2f, 0x1.d93534p-15f),
+    (float2)(0x1.af0000p-2f, 0x1.293246p-14f),
+    (float2)(0x1.b44000p-2f, 0x1.eef798p-15f),
+    (float2)(0x1.b98000p-2f, 0x1.625a4cp-16f),
+    (float2)(0x1.bea000p-2f, 0x1.4d9da6p-14f),
+    (float2)(0x1.c3c000p-2f, 0x1.d7a7ccp-14f),
+    (float2)(0x1.c8e000p-2f, 0x1.f7c79ap-14f),
+    (float2)(0x1.ce0000p-2f, 0x1.af0b84p-14f),
+    (float2)(0x1.d32000p-2f, 0x1.fcfc00p-15f),
+    (float2)(0x1.d82000p-2f, 0x1.e7258ap-14f),
+    (float2)(0x1.dd4000p-2f, 0x1.a81306p-16f),
+    (float2)(0x1.e24000p-2f, 0x1.1034f8p-15f),
+    (float2)(0x1.e74000p-2f, 0x1.09875ap-16f),
+    (float2)(0x1.ec2000p-2f, 0x1.99d246p-14f),
+    (float2)(0x1.f12000p-2f, 0x1.1ebf5ep-15f),
+    (float2)(0x1.f60000p-2f, 0x1.23fa70p-14f),
+    (float2)(0x1.fae000p-2f, 0x1.588f78p-14f),
+    (float2)(0x1.ffc000p-2f, 0x1.2e0856p-14f),
+    (float2)(0x1.024000p-1f, 0x1.52a5a4p-13f),
+    (float2)(0x1.04a000p-1f, 0x1.df9da8p-13f),
+    (float2)(0x1.072000p-1f, 0x1.f2e0e6p-16f),
+    (float2)(0x1.098000p-1f, 0x1.bd3d5cp-15f),
+    (float2)(0x1.0be000p-1f, 0x1.cb9094p-15f),
+    (float2)(0x1.0e4000p-1f, 0x1.261746p-15f),
+    (float2)(0x1.108000p-1f, 0x1.f39e2cp-13f),
+    (float2)(0x1.12e000p-1f, 0x1.719592p-13f),
+    (float2)(0x1.154000p-1f, 0x1.87a5e8p-14f),
+    (float2)(0x1.178000p-1f, 0x1.eabbd8p-13f),
+    (float2)(0x1.19e000p-1f, 0x1.cd68cep-14f),
+    (float2)(0x1.1c2000p-1f, 0x1.b81f70p-13f),
+    (float2)(0x1.1e8000p-1f, 0x1.7d79c0p-15f),
+    (float2)(0x1.20c000p-1f, 0x1.b9a324p-14f),
+    (float2)(0x1.230000p-1f, 0x1.30d7bep-13f),
+    (float2)(0x1.254000p-1f, 0x1.5bce98p-13f),
+    (float2)(0x1.278000p-1f, 0x1.5e1288p-13f),
+    (float2)(0x1.29c000p-1f, 0x1.37fec2p-13f),
+    (float2)(0x1.2c0000p-1f, 0x1.d3da88p-14f),
+    (float2)(0x1.2e4000p-1f, 0x1.d0db90p-15f),
+    (float2)(0x1.306000p-1f, 0x1.d7334ep-13f),
+    (float2)(0x1.32a000p-1f, 0x1.133912p-13f),
+    (float2)(0x1.34e000p-1f, 0x1.44ece6p-16f),
+    (float2)(0x1.370000p-1f, 0x1.17b546p-13f),
+    (float2)(0x1.392000p-1f, 0x1.e0d356p-13f),
+    (float2)(0x1.3b6000p-1f, 0x1.0893fep-14f),
+    (float2)(0x1.3d8000p-1f, 0x1.026a70p-13f),
+    (float2)(0x1.3fa000p-1f, 0x1.5b84d0p-13f),
+    (float2)(0x1.41c000p-1f, 0x1.8fe846p-13f),
+    (float2)(0x1.43e000p-1f, 0x1.9fe2f8p-13f),
+    (float2)(0x1.460000p-1f, 0x1.8bc21cp-13f),
+    (float2)(0x1.482000p-1f, 0x1.53d1eap-13f),
+    (float2)(0x1.4a4000p-1f, 0x1.f0bb60p-14f),
+    (float2)(0x1.4c6000p-1f, 0x1.e6bf32p-15f),
+    (float2)(0x1.4e6000p-1f, 0x1.d811b6p-13f),
+    (float2)(0x1.508000p-1f, 0x1.13cc00p-13f),
+    (float2)(0x1.52a000p-1f, 0x1.6932dep-16f),
+    (float2)(0x1.54a000p-1f, 0x1.246798p-13f),
+    (float2)(0x1.56a000p-1f, 0x1.f9d5b2p-13f),
+    (float2)(0x1.58c000p-1f, 0x1.5b6b9ap-14f),
+    (float2)(0x1.5ac000p-1f, 0x1.404c34p-13f),
+    (float2)(0x1.5cc000p-1f, 0x1.b1dc6cp-13f),
+    (float2)(0x1.5ee000p-1f, 0x1.54920ap-20f),
+    (float2)(0x1.60e000p-1f, 0x1.97a23cp-16f),
+    (float2)(0x1.62e000p-1f, 0x1.0bfbe8p-15f),
+)
+
+DECLARE_TABLE(float, LOG_INV_TBL, 129,
+    0x1.000000p+1f,
+    0x1.fc07f0p+0f,
+    0x1.f81f82p+0f,
+    0x1.f4465ap+0f,
+    0x1.f07c20p+0f,
+    0x1.ecc07cp+0f,
+    0x1.e9131ap+0f,
+    0x1.e573acp+0f,
+    0x1.e1e1e2p+0f,
+    0x1.de5d6ep+0f,
+    0x1.dae608p+0f,
+    0x1.d77b66p+0f,
+    0x1.d41d42p+0f,
+    0x1.d0cb58p+0f,
+    0x1.cd8568p+0f,
+    0x1.ca4b30p+0f,
+    0x1.c71c72p+0f,
+    0x1.c3f8f0p+0f,
+    0x1.c0e070p+0f,
+    0x1.bdd2b8p+0f,
+    0x1.bacf92p+0f,
+    0x1.b7d6c4p+0f,
+    0x1.b4e81cp+0f,
+    0x1.b20364p+0f,
+    0x1.af286cp+0f,
+    0x1.ac5702p+0f,
+    0x1.a98ef6p+0f,
+    0x1.a6d01ap+0f,
+    0x1.a41a42p+0f,
+    0x1.a16d40p+0f,
+    0x1.9ec8eap+0f,
+    0x1.9c2d14p+0f,
+    0x1.99999ap+0f,
+    0x1.970e50p+0f,
+    0x1.948b10p+0f,
+    0x1.920fb4p+0f,
+    0x1.8f9c18p+0f,
+    0x1.8d3018p+0f,
+    0x1.8acb90p+0f,
+    0x1.886e60p+0f,
+    0x1.861862p+0f,
+    0x1.83c978p+0f,
+    0x1.818182p+0f,
+    0x1.7f4060p+0f,
+    0x1.7d05f4p+0f,
+    0x1.7ad220p+0f,
+    0x1.78a4c8p+0f,
+    0x1.767dcep+0f,
+    0x1.745d18p+0f,
+    0x1.724288p+0f,
+    0x1.702e06p+0f,
+    0x1.6e1f76p+0f,
+    0x1.6c16c2p+0f,
+    0x1.6a13cep+0f,
+    0x1.681682p+0f,
+    0x1.661ec6p+0f,
+    0x1.642c86p+0f,
+    0x1.623fa8p+0f,
+    0x1.605816p+0f,
+    0x1.5e75bcp+0f,
+    0x1.5c9882p+0f,
+    0x1.5ac056p+0f,
+    0x1.58ed24p+0f,
+    0x1.571ed4p+0f,
+    0x1.555556p+0f,
+    0x1.539094p+0f,
+    0x1.51d07ep+0f,
+    0x1.501502p+0f,
+    0x1.4e5e0ap+0f,
+    0x1.4cab88p+0f,
+    0x1.4afd6ap+0f,
+    0x1.49539ep+0f,
+    0x1.47ae14p+0f,
+    0x1.460cbcp+0f,
+    0x1.446f86p+0f,
+    0x1.42d662p+0f,
+    0x1.414142p+0f,
+    0x1.3fb014p+0f,
+    0x1.3e22ccp+0f,
+    0x1.3c995ap+0f,
+    0x1.3b13b2p+0f,
+    0x1.3991c2p+0f,
+    0x1.381382p+0f,
+    0x1.3698e0p+0f,
+    0x1.3521d0p+0f,
+    0x1.33ae46p+0f,
+    0x1.323e34p+0f,
+    0x1.30d190p+0f,
+    0x1.2f684cp+0f,
+    0x1.2e025cp+0f,
+    0x1.2c9fb4p+0f,
+    0x1.2b404ap+0f,
+    0x1.29e412p+0f,
+    0x1.288b02p+0f,
+    0x1.27350cp+0f,
+    0x1.25e228p+0f,
+    0x1.24924ap+0f,
+    0x1.234568p+0f,
+    0x1.21fb78p+0f,
+    0x1.20b470p+0f,
+    0x1.1f7048p+0f,
+    0x1.1e2ef4p+0f,
+    0x1.1cf06ap+0f,
+    0x1.1bb4a4p+0f,
+    0x1.1a7b96p+0f,
+    0x1.194538p+0f,
+    0x1.181182p+0f,
+    0x1.16e068p+0f,
+    0x1.15b1e6p+0f,
+    0x1.1485f0p+0f,
+    0x1.135c82p+0f,
+    0x1.12358ep+0f,
+    0x1.111112p+0f,
+    0x1.0fef02p+0f,
+    0x1.0ecf56p+0f,
+    0x1.0db20ap+0f,
+    0x1.0c9714p+0f,
+    0x1.0b7e6ep+0f,
+    0x1.0a6810p+0f,
+    0x1.0953f4p+0f,
+    0x1.084210p+0f,
+    0x1.073260p+0f,
+    0x1.0624dep+0f,
+    0x1.051980p+0f,
+    0x1.041042p+0f,
+    0x1.03091cp+0f,
+    0x1.020408p+0f,
+    0x1.010102p+0f,
+    0x1.000000p+0f,
+)
+
+DECLARE_TABLE(float2, LOG_INV_TBL_EP, 129,
+    (float2)(0x1.000000p+1f, 0x0.000000p+0f),
+    (float2)(0x1.fc0000p+0f, 0x1.fc07f0p-14f),
+    (float2)(0x1.f80000p+0f, 0x1.f81f82p-12f),
+    (float2)(0x1.f40000p+0f, 0x1.196792p-10f),
+    (float2)(0x1.f00000p+0f, 0x1.f07c20p-10f),
+    (float2)(0x1.ec0000p+0f, 0x1.80f660p-9f),
+    (float2)(0x1.e80000p+0f, 0x1.131ac0p-8f),
+    (float2)(0x1.e40000p+0f, 0x1.73ac90p-8f),
+    (float2)(0x1.e00000p+0f, 0x1.e1e1e2p-8f),
+    (float2)(0x1.de0000p+0f, 0x1.75b8fep-10f),
+    (float2)(0x1.da0000p+0f, 0x1.cc0ed8p-9f),
+    (float2)(0x1.d60000p+0f, 0x1.7b654cp-8f),
+    (float2)(0x1.d40000p+0f, 0x1.d41d42p-12f),
+    (float2)(0x1.d00000p+0f, 0x1.96b1eep-9f),
+    (float2)(0x1.cc0000p+0f, 0x1.856890p-8f),
+    (float2)(0x1.ca0000p+0f, 0x1.2cc158p-10f),
+    (float2)(0x1.c60000p+0f, 0x1.1c71c8p-8f),
+    (float2)(0x1.c20000p+0f, 0x1.f8f01cp-8f),
+    (float2)(0x1.c00000p+0f, 0x1.c0e070p-9f),
+    (float2)(0x1.bc0000p+0f, 0x1.d2b89ap-8f),
+    (float2)(0x1.ba0000p+0f, 0x1.9f2298p-9f),
+    (float2)(0x1.b60000p+0f, 0x1.d6c3dep-8f),
+    (float2)(0x1.b40000p+0f, 0x1.d0369ep-9f),
+    (float2)(0x1.b20000p+0f, 0x1.b20364p-15f),
+    (float2)(0x1.ae0000p+0f, 0x1.286bcap-8f),
+    (float2)(0x1.ac0000p+0f, 0x1.5c06b2p-10f),
+    (float2)(0x1.a80000p+0f, 0x1.8ef606p-8f),
+    (float2)(0x1.a60000p+0f, 0x1.a034dap-9f),
+    (float2)(0x1.a40000p+0f, 0x1.a41a42p-12f),
+    (float2)(0x1.a00000p+0f, 0x1.6d3f98p-8f),
+    (float2)(0x1.9e0000p+0f, 0x1.91d2a2p-9f),
+    (float2)(0x1.9c0000p+0f, 0x1.68a772p-11f),
+    (float2)(0x1.980000p+0f, 0x1.99999ap-8f),
+    (float2)(0x1.960000p+0f, 0x1.0e4f80p-8f),
+    (float2)(0x1.940000p+0f, 0x1.161f9ap-9f),
+    (float2)(0x1.920000p+0f, 0x1.f693a2p-13f),
+    (float2)(0x1.8e0000p+0f, 0x1.9c18fap-8f),
+    (float2)(0x1.8c0000p+0f, 0x1.3018d4p-8f),
+    (float2)(0x1.8a0000p+0f, 0x1.9721eep-9f),
+    (float2)(0x1.880000p+0f, 0x1.b97c2ap-10f),
+    (float2)(0x1.860000p+0f, 0x1.861862p-12f),
+    (float2)(0x1.820000p+0f, 0x1.c977acp-8f),
+    (float2)(0x1.800000p+0f, 0x1.818182p-8f),
+    (float2)(0x1.7e0000p+0f, 0x1.405fd0p-8f),
+    (float2)(0x1.7c0000p+0f, 0x1.05f418p-8f),
+    (float2)(0x1.7a0000p+0f, 0x1.a4411cp-9f),
+    (float2)(0x1.780000p+0f, 0x1.499030p-9f),
+    (float2)(0x1.760000p+0f, 0x1.f7390ep-10f),
+    (float2)(0x1.740000p+0f, 0x1.745d18p-10f),
+    (float2)(0x1.720000p+0f, 0x1.0a1fd2p-10f),
+    (float2)(0x1.700000p+0f, 0x1.702e06p-11f),
+    (float2)(0x1.6e0000p+0f, 0x1.f76b44p-12f),
+    (float2)(0x1.6c0000p+0f, 0x1.6c16c2p-12f),
+    (float2)(0x1.6a0000p+0f, 0x1.3cd154p-12f),
+    (float2)(0x1.680000p+0f, 0x1.681682p-12f),
+    (float2)(0x1.660000p+0f, 0x1.ec6a52p-12f),
+    (float2)(0x1.640000p+0f, 0x1.642c86p-11f),
+    (float2)(0x1.620000p+0f, 0x1.fd3b80p-11f),
+    (float2)(0x1.600000p+0f, 0x1.605816p-10f),
+    (float2)(0x1.5e0000p+0f, 0x1.d6ee34p-10f),
+    (float2)(0x1.5c0000p+0f, 0x1.310572p-9f),
+    (float2)(0x1.5a0000p+0f, 0x1.80ad60p-9f),
+    (float2)(0x1.580000p+0f, 0x1.da4610p-9f),
+    (float2)(0x1.560000p+0f, 0x1.1ed3c6p-8f),
+    (float2)(0x1.540000p+0f, 0x1.555556p-8f),
+    (float2)(0x1.520000p+0f, 0x1.909490p-8f),
+    (float2)(0x1.500000p+0f, 0x1.d07eaep-8f),
+    (float2)(0x1.500000p+0f, 0x1.501502p-12f),
+    (float2)(0x1.4e0000p+0f, 0x1.7829ccp-10f),
+    (float2)(0x1.4c0000p+0f, 0x1.5710e4p-9f),
+    (float2)(0x1.4a0000p+0f, 0x1.fad40ap-9f),
+    (float2)(0x1.480000p+0f, 0x1.539e3cp-8f),
+    (float2)(0x1.460000p+0f, 0x1.ae147ap-8f),
+    (float2)(0x1.460000p+0f, 0x1.978fecp-13f),
+    (float2)(0x1.440000p+0f, 0x1.be1958p-10f),
+    (float2)(0x1.420000p+0f, 0x1.acc4bap-9f),
+    (float2)(0x1.400000p+0f, 0x1.414142p-8f),
+    (float2)(0x1.3e0000p+0f, 0x1.b013fcp-8f),
+    (float2)(0x1.3e0000p+0f, 0x1.165e72p-11f),
+    (float2)(0x1.3c0000p+0f, 0x1.32b490p-9f),
+    (float2)(0x1.3a0000p+0f, 0x1.13b13cp-8f),
+    (float2)(0x1.380000p+0f, 0x1.91c2c2p-8f),
+    (float2)(0x1.380000p+0f, 0x1.381382p-12f),
+    (float2)(0x1.360000p+0f, 0x1.31be7cp-9f),
+    (float2)(0x1.340000p+0f, 0x1.21cfb2p-8f),
+    (float2)(0x1.320000p+0f, 0x1.ae45b6p-8f),
+    (float2)(0x1.320000p+0f, 0x1.f1a516p-11f),
+    (float2)(0x1.300000p+0f, 0x1.a32026p-9f),
+    (float2)(0x1.2e0000p+0f, 0x1.684bdap-8f),
+    (float2)(0x1.2e0000p+0f, 0x1.2e025cp-15f),
+    (float2)(0x1.2c0000p+0f, 0x1.3f69b0p-9f),
+    (float2)(0x1.2a0000p+0f, 0x1.404ad0p-8f),
+    (float2)(0x1.280000p+0f, 0x1.e4129ep-8f),
+    (float2)(0x1.280000p+0f, 0x1.160252p-9f),
+    (float2)(0x1.260000p+0f, 0x1.350b88p-8f),
+    (float2)(0x1.240000p+0f, 0x1.e22708p-8f),
+    (float2)(0x1.240000p+0f, 0x1.24924ap-9f),
+    (float2)(0x1.220000p+0f, 0x1.45678ap-8f),
+    (float2)(0x1.200000p+0f, 0x1.fb7812p-8f),
+    (float2)(0x1.200000p+0f, 0x1.68e18cp-9f),
+    (float2)(0x1.1e0000p+0f, 0x1.7047dcp-8f),
+    (float2)(0x1.1e0000p+0f, 0x1.779da0p-11f),
+    (float2)(0x1.1c0000p+0f, 0x1.e0d5b4p-9f),
+    (float2)(0x1.1a0000p+0f, 0x1.b4a404p-8f),
+    (float2)(0x1.1a0000p+0f, 0x1.ee5846p-10f),
+    (float2)(0x1.180000p+0f, 0x1.453808p-8f),
+    (float2)(0x1.180000p+0f, 0x1.181182p-12f),
+    (float2)(0x1.160000p+0f, 0x1.c0d128p-9f),
+    (float2)(0x1.140000p+0f, 0x1.b1e5f8p-8f),
+    (float2)(0x1.140000p+0f, 0x1.0be1c2p-9f),
+    (float2)(0x1.120000p+0f, 0x1.5c8114p-8f),
+    (float2)(0x1.120000p+0f, 0x1.ac73aep-11f),
+    (float2)(0x1.100000p+0f, 0x1.111112p-8f),
+    (float2)(0x1.0e0000p+0f, 0x1.ef0110p-8f),
+    (float2)(0x1.0e0000p+0f, 0x1.9ead7cp-9f),
+    (float2)(0x1.0c0000p+0f, 0x1.b20a88p-8f),
+    (float2)(0x1.0c0000p+0f, 0x1.2e29f8p-9f),
+    (float2)(0x1.0a0000p+0f, 0x1.7e6ec2p-8f),
+    (float2)(0x1.0a0000p+0f, 0x1.a0429ap-10f),
+    (float2)(0x1.080000p+0f, 0x1.53f390p-8f),
+    (float2)(0x1.080000p+0f, 0x1.084210p-10f),
+    (float2)(0x1.060000p+0f, 0x1.3260a4p-8f),
+    (float2)(0x1.060000p+0f, 0x1.26e978p-11f),
+    (float2)(0x1.040000p+0f, 0x1.197f7ep-8f),
+    (float2)(0x1.040000p+0f, 0x1.041042p-12f),
+    (float2)(0x1.020000p+0f, 0x1.091b52p-8f),
+    (float2)(0x1.020000p+0f, 0x1.020408p-14f),
+    (float2)(0x1.000000p+0f, 0x1.010102p-8f),
+    (float2)(0x1.000000p+0f, 0x0.000000p+0f),
+)
+
diff --git a/amd-builtins/math32/logbF.cl b/amd-builtins/math32/logbF.cl
new file mode 100644
index 0000000..ba18634
--- /dev/null
+++ b/amd-builtins/math32/logbF.cl
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+logb(float x)
+{
+    int ax = as_int(x) & EXSIGNBIT_SP32;
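+    // Subnormal: the unbiased exponent is (31 - clz(ax)) - 149 = -118 - clz(ax)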
+    float s = -118 - clz(ax);
+    float r = (ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    r = ax >= PINFBITPATT_SP32 ? as_float(ax) : r;
+    r = ax < 0x00800000 ? s : r;
+    r = ax == 0 ? as_float(NINFBITPATT_SP32) : r;
+    return r;
+}
diff --git a/amd-builtins/math32/madF.cl b/amd-builtins/math32/madF.cl
new file mode 100644
index 0000000..47c6736
--- /dev/null
+++ b/amd-builtins/math32/madF.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+mad(float x, float y, float z)
+{
+    return __amdil_mad_f32(x, y, z);
+}
diff --git a/amd-builtins/math32/math32.h b/amd-builtins/math32/math32.h
new file mode 100644
index 0000000..5d98944
--- /dev/null
+++ b/amd-builtins/math32/math32.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MATH32_H
+#define MATH32_H 1
+
+extern __attribute__((pure)) float  __amdil_copysign_f32(float, float);
+extern __attribute__((pure)) float  __amdil_fma_f32(float, float, float);
+extern __attribute__((pure)) float  __amdil_mad_f32(float, float, float);
+extern __attribute__((pure)) float  __amdil_min_f32(float, float);
+extern __attribute__((pure)) float  __amdil_max_f32(float, float);
+extern __attribute__((pure)) float  __ftz_f32(float);
+extern __attribute__((pure)) float  __amdil_round_nearest_f32(float);
+extern __attribute__((pure)) float  __amdil_round_neginf_f32(float);
+extern __attribute__((pure)) float  __amdil_round_posinf_f32(float);
+extern __attribute__((pure)) float  __amdil_round_zero_f32(float);
+extern __attribute__((pure)) float  __amdil_fabs_f32(float);
+extern __attribute__((pure)) float __amdil_improved_div_f32(float, float);
+extern __attribute__((pure)) float  __amdil_fraction_f32(float);
+extern __attribute__((pure)) uint  __amdil_cmov_logical_i32(uint,  uint,  uint);
+extern __attribute__((pure)) uint __amdil_is_asic_id_i32(uint);
+extern __attribute__((pure)) uint __amdil_is_constant_f32(float);
+
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+extern __attribute__((pure)) int __amdil_class_f32(float, int);
+
+// HSA definitions for these macros
+#define HAVE_HW_FMA32() (1)
+#define HAVE_BITALIGN() (0)
+#define HAVE_FAST_FMA32() (0)
+
+
+// Allow control over how division is done
+#define MATH_DIVIDE(X, Y) native_divide(X, Y)
+// #define MATH_DIVIDE(X,Y) ((X) / (Y))
+#define MATH_RECIP(X) native_recip(X)
+// #define MATH_RECIP(X) (1.0f / (X))
+
+// Allow control over square root
+#define MATH_SQRT(X) native_sqrt(X)
+
+// Force a flush of a subnormal to zero by feeding it through a functional unit
+#define FTZ(X) __ftz_f32(X)
+
+// Table stuff
+#define TABLE_SPACE __constant
+
+#define TABLE_MANGLE(NAME) __math32_##NAME
+
+#define USE_TABLE(TYPE,PTR,NAME) \
+    extern TABLE_SPACE TYPE TABLE_MANGLE(NAME) []; \
+    TABLE_SPACE TYPE * PTR = TABLE_MANGLE(NAME)
+
+#define DECLARE_TABLE(TYPE,NAME,LENGTH,...) \
+    TABLE_SPACE TYPE TABLE_MANGLE(NAME) [ LENGTH ] = { __VA_ARGS__ };
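+
+// Expansion sketch (illustrative): USE_TABLE(float, p_inv, LOG_INV_TBL)
+// declares
+//     extern __constant float __math32_LOG_INV_TBL [];
+//     __constant float *p_inv = __math32_LOG_INV_TBL;
+// pairing with a DECLARE_TABLE(float, LOG_INV_TBL, 129, ...) in a table file.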
+
+/* These definitions, used by float functions,
+   are for both 32 and 64 bit machines */
+#define SIGNBIT_SP32      0x80000000
+#define EXSIGNBIT_SP32    0x7fffffff
+#define EXPBITS_SP32      0x7f800000
+#define MANTBITS_SP32     0x007fffff
+#define ONEEXPBITS_SP32   0x3f800000
+#define TWOEXPBITS_SP32   0x40000000
+#define HALFEXPBITS_SP32  0x3f000000
+#define IMPBIT_SP32       0x00800000
+#define QNANBITPATT_SP32  0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32  0x7f800000
+#define NINFBITPATT_SP32  0xff800000
+#define EXPBIAS_SP32      127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32   1
+#define EMIN_SP32         -126
+#define BIASEDEMAX_SP32   254
+#define EMAX_SP32         127
+#define LAMBDA_SP32       1.0e30
+#define MANTLENGTH_SP32   24
+#define BASEDIGITS_SP32   7
+
+#define ALIGNED(x)	__attribute__((aligned(x)))
+
+// Workaround a bug in the Apple linker that prevents inlining of large,
+// frequently-used static functions that only have the inline attribute.
+// Force all inline functions to be always_inlined.
+#ifdef USE_APPLE
+#define inline __attribute__((always_inline))
+#endif
+
+#endif /* MATH32_H */
+
diff --git a/amd-builtins/math32/maxmagF.cl b/amd-builtins/math32/maxmagF.cl
new file mode 100644
index 0000000..44015d9
--- /dev/null
+++ b/amd-builtins/math32/maxmagF.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+maxmag(float x, float y)
+{
+    int ix = as_int(x);
+    int iy = as_int(y);
+    int ax = ix & 0x7fffffff;
+    int ay = iy & 0x7fffffff;
+    ax |= -(ax > 0x7f800000);
+    ay |= -(ay > 0x7f800000);
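+    // The ORs above force a NaN magnitude to 0xffffffff (-1 as a signed int),
+    // so the signed comparisons below always select the non-NaN operand.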
+    return as_float((-(ax > ay) & ix) |
+	            (-(ay > ax) & iy) |
+		    (-(ax == ay) & ((ix & iy) | (ax & 0x00400000))));
+}
+
diff --git a/amd-builtins/math32/minmagF.cl b/amd-builtins/math32/minmagF.cl
new file mode 100644
index 0000000..73f9d86
--- /dev/null
+++ b/amd-builtins/math32/minmagF.cl
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+minmag(float x, float y)
+{
+    int ix = as_int(x);
+    int iy = as_int(y);
+    int ax = ix & 0x7fffffff;
+    int ay = iy & 0x7fffffff;
+    return as_float((-(ax < ay) & ix) |
+	            (-(ay < ax) & iy) |
+		    (-(ax == ay) & (ix | iy)));
+}
+
diff --git a/amd-builtins/math32/modfF.cl b/amd-builtins/math32/modfF.cl
new file mode 100644
index 0000000..5369c0b
--- /dev/null
+++ b/amd-builtins/math32/modfF.cl
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+modf(float x, float *iptr)
+{
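+    // e is x's unbiased exponent.  For 0 <= e < 23 the mask selects the
+    // integer-valued bits of the mantissa; e < 0 means |x| < 1 (integral
+    // part is +-0), and e >= 23 means x is already integral (fraction +-0).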
+    int ux = as_int(x);
+    int e = ((ux >> 23) & 0xff) - 127;
+    int s = ux & 0x80000000;
+    int msk = 0xffffffff << (23 - e);
+    int i = msk & ux;
+    int r = as_uint(x - as_float(i)) | s;
+
+    r = e < 0 ? ux : r;
+    i = e < 0 ? s : i;
+
+    r = e >= 23 ? s : r;
+    i = e >= 23 ? ux : i;
+
+    r = (ux & 0x7fffffff) > 0x7f800000 ? ux : r;
+
+    *iptr = as_float(i);
+    return as_float(r);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline)) float
+modf(float x, __global float *iptr)
+{
+    float i;
+    float f = modf(x, &i);
+    *iptr = i;
+    return f;
+}
+
+__attribute__((overloadable, always_inline)) float
+modf(float x, __local float *iptr)
+{
+    float i;
+    float f = modf(x, &i);
+    *iptr = i;
+    return f;
+}
+#endif
+
diff --git a/amd-builtins/math32/nanF.cl b/amd-builtins/math32/nanF.cl
new file mode 100644
index 0000000..33cda39
--- /dev/null
+++ b/amd-builtins/math32/nanF.cl
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
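+// Build a quiet NaN: 0x7fc00000 sets the exponent and quiet bit, and the
+// low 20 bits of nancode become the NaN payload.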
+__attribute__((overloadable, always_inline)) float
+nan(uint nancode)
+{
+    return as_float((nancode & 0xfffff) | 0x7fc00000);
+}
+
diff --git a/amd-builtins/math32/nextafterF.cl b/amd-builtins/math32/nextafterF.cl
new file mode 100644
index 0000000..e2e75af
--- /dev/null
+++ b/amd-builtins/math32/nextafterF.cl
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+nextafter(float x, float y)
+{
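+    // Map the float ordering onto integer ordering (negative floats are
+    // reflected through 0x80000000), step one ulp toward y, then map back.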
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    int mx = 0x80000000 - ix;
+    mx = ix < 0 ? mx : ix;
+    int iy = as_int(y);
+    int ay = iy & 0x7fffffff;
+    int my = 0x80000000 - iy;
+    my = iy < 0 ? my : iy;
+    int t = mx + (mx < my ? 1 : -1);
+    int r = 0x80000000 - t;
+    r = t < 0 ? r : t;
+    r = ax > 0x7f800000 ? ix : r;
+    r = ay > 0x7f800000 ? iy : r;
+    r = (ax|ay) == 0 | ix == iy ? iy : r;
+    return as_float(r);
+}
diff --git a/amd-builtins/math32/pdivF.cl b/amd-builtins/math32/pdivF.cl
new file mode 100644
index 0000000..83c3d1f
--- /dev/null
+++ b/amd-builtins/math32/pdivF.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+extern __attribute__((pure)) float __hsail_div_f32(float, float);
+
+__attribute__((always_inline, weak)) float
+__precise_fp32_div_f32(float x, float y)
+{
+    return __hsail_div_f32(x,y);
+}
+
+
+__attribute__((always_inline, weak)) float2
+__precise_fp32_div_2f32(float2 x, float2 y)
+{
+    float2 ret;
+    ret.lo = __precise_fp32_div_f32(x.lo, y.lo);
+    ret.hi = __precise_fp32_div_f32(x.hi, y.hi);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float3
+__precise_fp32_div_3f32(float3 x, float3 y)
+{
+    float3 ret;
+    ret.xy = __precise_fp32_div_2f32(x.xy, y.xy);
+    ret.z = __precise_fp32_div_f32(x.z, y.z);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float4
+__precise_fp32_div_4f32(float4 x, float4 y)
+{
+    float4 ret;
+    ret.lo = __precise_fp32_div_2f32(x.lo, y.lo);
+    ret.hi = __precise_fp32_div_2f32(x.hi, y.hi);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float8
+__precise_fp32_div_8f32(float8 x, float8 y)
+{
+    float8 ret;
+    ret.lo = __precise_fp32_div_4f32(x.lo, y.lo);
+    ret.hi = __precise_fp32_div_4f32(x.hi, y.hi);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float16
+__precise_fp32_div_16f32(float16 x, float16 y)
+{
+    float16 ret;
+    ret.s0123 = __precise_fp32_div_4f32(x.s0123, y.s0123);
+    ret.s4567 = __precise_fp32_div_4f32(x.s4567, y.s4567);
+    ret.s89ab = __precise_fp32_div_4f32(x.s89ab, y.s89ab);
+    ret.scdef = __precise_fp32_div_4f32(x.scdef, y.scdef);
+    return ret;
+}
diff --git a/amd-builtins/math32/powF.cl b/amd-builtins/math32/powF.cl
new file mode 100644
index 0000000..de0710c
--- /dev/null
+++ b/amd-builtins/math32/powF.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_POW
+#include "powF_base.h"
+
diff --git a/amd-builtins/math32/powF_base.h b/amd-builtins/math32/powF_base.h
new file mode 100644
index 0000000..e15ed5f
--- /dev/null
+++ b/amd-builtins/math32/powF_base.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+// Compute pow using log and exp:
+// x^y = exp(y * log(x))
+//
+// We take care not to lose precision in the intermediate steps.
+//
+// When computing log, calculate it in split (head/tail) form:
+//
+// r = f * (p_inv_head + p_inv_tail)
+// r = rh + rt
+//
+// Calculate the log polynomial using r; in the final addition, do
+// poly = poly + ((rh-r) + rt)
+//
+// lth = -r
+// ltt = ((xexp * log2_t) - poly) + logT
+// lt = lth + ltt
+//
+// lh = (xexp * log2_h) + logH
+// l = lh + lt
+//
+// Calculate final log answer as gh and gt,
+// gh = l & higher-half bits
+// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
+//
+// yh = y & higher-half bits
+// yt = y - yh
+//
+// Before entering computation of exp,
+// vs = ((yt*gt + yt*gh) + yh*gt)
+// v = vs + yh*gh
+// vt = ((yh*gh - v) + vs)
+//
+// In calculation of exp, add vt to r that is used for poly
+// At the end of exp, do
+// ((((expT * poly) + expT) + expH*poly) + expH)
+
+__attribute__((overloadable)) float
+#if defined(COMPILING_POWR)
+powr(float x, float y)
+#elif defined(COMPILING_POWN)
+pown(float x, int ny)
+#elif defined(COMPILING_ROOTN)
+rootn(float x, int ny)
+#else
+pow(float x, float y)
+#endif
+{
+    USE_TABLE(float2, p_log, LOGE_TBL);
+    USE_TABLE(float2, p_inv, LOG_INV_TBL_EP);
+    USE_TABLE(float2, p_jby64, EXP_TBL_EP);
+
+#if defined(COMPILING_POWN)
+    float y = (float)ny;
+#elif defined(COMPILING_ROOTN)
+    float y = MATH_RECIP((float)ny);
+#endif
+
+    int ix = as_int(x);
+    int ax = ix & EXSIGNBIT_SP32;
+    int xpos = ix == ax;
+
+    int iy = as_int(y);
+    int ay = iy & EXSIGNBIT_SP32;
+    int ypos = iy == ay;
+
+    // Extra precise log calculation
+    // First handle case that x is close to 1
+    float r = 1.0f - as_float(ax);
+    int near1 = fabs(r) < 0x1.0p-4f;
+    float r2 = r*r;
+
+    // Coefficients are just 1/3, 1/4, 1/5, 1/6 and 1/7
+    float poly = mad(r,
+                     mad(r,
+                         mad(r,
+                             mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+                             0x1.99999ap-3f),
+                         0x1.000000p-2f),
+                     0x1.555556p-2f);
+
+    poly *= r2*r;
+
+    float lth_near1 = -r2 * 0.5f;
+    float ltt_near1 = -poly;
+    float lt_near1 = lth_near1 + ltt_near1;
+    float lh_near1 = -r;
+    float l_near1 = lh_near1 + lt_near1;
+
+    // Computations for x not near 1
+    int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    float mf = (float)m;
+    int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+    float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+    int c = m == -127;
+    int ixn = c ? ixs : ax;
+    float mfn = c ? mfs : mf;
+
+    int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+    // F - Y
+    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+    indx = indx >> 16;
+    float2 tv = p_inv[indx];
+    float rh = f * tv.s0;
+    float rt = f * tv.s1;
+    r = rh + rt;
+
+    poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
+    poly += (rh - r) + rt;
+
+    const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+    tv = p_log[indx];
+    float lth = -r;
+    float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+    float lt = lth + ltt;
+    float lh = mad(mfn, LOG2_HEAD, tv.s0);
+    float l = lh + lt;
+
+    // Select near 1 or not
+    lth = near1 ? lth_near1 : lth;
+    ltt = near1 ? ltt_near1 : ltt;
+    lt = near1 ? lt_near1 : lt;
+    lh = near1 ? lh_near1 : lh;
+    l = near1 ? l_near1 : l;
+
+    float gh = as_float(as_int(l) & 0xfffff000);
+    float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+    float yh = as_float(iy & 0xfffff000);
+
+#if defined(COMPILING_POWN)
+    float yt = (float)(ny - (int)yh);
+#elif defined(COMPILING_ROOTN)
+    float fny = (float)ny;
+    float fnyh = as_float(as_int(fny) & 0xfffff000);
+    float fnyt = (float)(ny - (int)fnyh);
+    float yt = MATH_DIVIDE(mad(-fnyt, yh, mad(-fnyh, yh, 1.0f)), fny);
+#else
+    float yt = y - yh;
+#endif
+
+    float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
+    float ylogx = mad(yh, gh, ylogx_s);
+    float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
+
+    // Extra precise exp of ylogx
+    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
+    int n = convert_int(ylogx * R_64_BY_LOG2);
+    float nf = (float) n;
+
+    int j = n & 0x3f;
+    m = n >> 6;
+    int m2 = m << EXPSHIFTBITS_SP32;
+
+    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
+    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
+    r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
+
+    // Truncated Taylor series for e^r
+    poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
+
+    tv = p_jby64[j];
+
+    float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
+#if !defined(SUBNORMALS_SUPPORTED)
+    int explg = (int)((as_uint(expylogx) & EXPBITS_SP32) >> 23) - 127; // exponent of expylogx (currently unused)
+    m = (23 - (m + 149)) == 0 ? 1 : m;
+    uint mantissa = ((as_uint(expylogx) & MANTBITS_SP32) | IMPBIT_SP32) >> (23 - (m + 149));
+    float sexpylogx = as_float(mantissa);
+#else
+    float sexpylogx = expylogx * as_float(0x1 << (m + 149));
+#endif
+
+    float texpylogx = as_float(as_int(expylogx) + m2);
+    expylogx = m < -125 ? sexpylogx : texpylogx;
+
+    // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+    expylogx = ylogx > 0x1.62e430p+6f | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f) ? as_float(PINFBITPATT_SP32) : expylogx;
+
+    // Result is 0 if ylogx < -149*log2
+    expylogx = ylogx <  -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+    // Classify y:
+    //   inty = 0 means not an integer.
+    //   inty = 1 means odd integer.
+    //   inty = 2 means even integer.
+
+#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN)
+    int inty = 2 - (ny & 1);
+#else
+    int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
+    int mask = (1 << (24 - yexp)) - 1;
+    int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
+    int inty = yodd ? 1 : 2;
+    inty = (iy & mask) != 0 ? 0 : inty;
+    inty = yexp < 1 ? 0 : inty;
+    inty = yexp > 24 ? 2 : inty;
+#endif
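+
+    // e.g. y = 3.0f -> inty = 1 (odd), y = 4.0f -> inty = 2 (even),
+    //      y = 2.5f -> inty = 0 (not an integer)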
+
+    float signval = as_float(as_uint(expylogx) ^ SIGNBIT_SP32);
+    expylogx = (inty == 1 & !xpos) ? signval : expylogx;
+    int ret = as_int(expylogx);
+
+    // Corner case handling
+
+#if defined COMPILING_POWR
+    ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+    ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
+    ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
+    ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
+    ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
+    ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+    ret = ix < PINFBITPATT_SP32 & ay == 0 ? 0x3f800000 : ret;
+    ret = ax == PINFBITPATT_SP32 & !ypos ? 0 : ret;
+    ret = ax == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret;
+    ret = ax == PINFBITPATT_SP32 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+    ret = ax == PINFBITPATT_SP32 & ay == 0 ? QNANBITPATT_SP32 : ret;
+    ret = ax == 0 & !ypos ? PINFBITPATT_SP32 : ret;
+    ret = ax == 0 & ypos ? 0 : ret;
+    ret = ax == 0 & ay == 0 ? QNANBITPATT_SP32 : ret;
+    ret = ax != 0 & !xpos ? QNANBITPATT_SP32 : ret;
+    ret = ax > PINFBITPATT_SP32 ? ix : ret;
+    ret = ay > PINFBITPATT_SP32 ? iy : ret;
+#elif defined COMPILING_POWN
+    int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+    ret = ax == 0 & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0 & !ypos & inty == 2 ? PINFBITPATT_SP32 : ret;
+    ret = ax == 0 & ypos & inty == 2 ? 0 : ret;
+    int xzero = !xpos ? 0x80000000 : 0L;
+    ret = ax == 0 & ypos & inty == 1 ? xzero : ret;
+    ret = ix == NINFBITPATT_SP32 & !ypos & inty == 1 ? 0x80000000 : ret;
+    ret = ix == NINFBITPATT_SP32 & !ypos & inty != 1 ? 0 : ret;
+    ret = ix == NINFBITPATT_SP32 & ypos & inty == 1 ? NINFBITPATT_SP32 : ret;
+    ret = ix == NINFBITPATT_SP32 & ypos & inty != 1 ? PINFBITPATT_SP32 : ret;
+    ret = ix == PINFBITPATT_SP32 & !ypos ? 0 : ret;
+    ret = ix == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret;
+    ret = ax > PINFBITPATT_SP32 ? ix : ret;
+    ret = ny == 0 ? 0x3f800000 : ret;
+#elif defined COMPILING_ROOTN
+    ret = !xpos & inty == 2 ? QNANBITPATT_SP32 : ret;
+    int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+    ret = ax == 0 & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0 & !ypos & inty == 2 ? PINFBITPATT_SP32 : ret;
+    ret = ax == 0 & ypos & inty == 2 ? 0 : ret;
+    int xzero = xpos ? 0 : 0x80000000;
+    ret = ax == 0 & ypos & inty == 1 ? xzero : ret;
+    ret = ix == NINFBITPATT_SP32 & ypos & inty == 1 ? NINFBITPATT_SP32 : ret;
+    ret = ix == NINFBITPATT_SP32 & !ypos & inty == 1 ? 0x80000000 : ret;
+    ret = ix == PINFBITPATT_SP32 & !ypos ? 0 : ret;
+    ret = ix == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret;
+    ret = ax > PINFBITPATT_SP32 ? ix : ret;
+    ret = ny == 0 ? QNANBITPATT_SP32 : ret;
+#else
+    ret = !xpos & inty == 0 ? QNANBITPATT_SP32 : ret;
+    ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+    ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
+    ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
+    ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+    int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+    ret = ax == 0 & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0 & !ypos & inty != 1 ? PINFBITPATT_SP32 : ret;
+    int xzero = xpos ? 0 : 0x80000000;
+    ret = ax == 0 & ypos & inty == 1 ? xzero : ret;
+    ret = ax == 0 & ypos & inty != 1 ? 0 : ret;
+    ret = ax == 0 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+    ret = ix == 0xbf800000 & ay == PINFBITPATT_SP32 ? 0x3f800000 : ret;
+    ret = ix == NINFBITPATT_SP32 & !ypos & inty == 1 ? 0x80000000 : ret;
+    ret = ix == NINFBITPATT_SP32 & !ypos & inty != 1 ? 0 : ret;
+    ret = ix == NINFBITPATT_SP32 & ypos & inty == 1 ? NINFBITPATT_SP32 : ret;
+    ret = ix == NINFBITPATT_SP32 & ypos & inty != 1 ? PINFBITPATT_SP32 : ret;
+    ret = ix == PINFBITPATT_SP32 & !ypos ? 0 : ret;
+    ret = ix == PINFBITPATT_SP32 & ypos ? PINFBITPATT_SP32 : ret;
+    ret = ax > PINFBITPATT_SP32 ? ix : ret;
+    ret = ay > PINFBITPATT_SP32 ? iy : ret;
+    ret = ay == 0 ? 0x3f800000 : ret;
+    ret = ix == 0x3f800000 ? 0x3f800000 : ret;
+#endif
+
+    return as_float(ret);
+}
+
diff --git a/amd-builtins/math32/pownF.cl b/amd-builtins/math32/pownF.cl
new file mode 100644
index 0000000..f454464
--- /dev/null
+++ b/amd-builtins/math32/pownF.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_POWN
+#include "powF_base.h"
+
diff --git a/amd-builtins/math32/powrF.cl b/amd-builtins/math32/powrF.cl
new file mode 100644
index 0000000..7bd5743
--- /dev/null
+++ b/amd-builtins/math32/powrF.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_POWR
+#include "powF_base.h"
+
diff --git a/amd-builtins/math32/psqrtF.cl b/amd-builtins/math32/psqrtF.cl
new file mode 100644
index 0000000..0716fbc
--- /dev/null
+++ b/amd-builtins/math32/psqrtF.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+extern __attribute__((pure)) float __hsail_sqrt_ftz_f32(float);
+
+__attribute__((always_inline, weak)) float
+__precise_fp32_sqrt_f32(float x)
+{
+    return __hsail_sqrt_ftz_f32(x);
+}
+
+
+__attribute__((always_inline, weak)) float2
+__precise_fp32_sqrt_2f32(float2 x)
+{
+    float2 ret;
+    ret.lo = __precise_fp32_sqrt_f32(x.lo);
+    ret.hi = __precise_fp32_sqrt_f32(x.hi);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float3
+__precise_fp32_sqrt_3f32(float3 x)
+{
+    float3 ret;
+    ret.xy = __precise_fp32_sqrt_2f32(x.xy);
+    ret.z = __precise_fp32_sqrt_f32(x.z);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float4
+__precise_fp32_sqrt_4f32(float4 x)
+{
+    float4 ret;
+    ret.lo = __precise_fp32_sqrt_2f32(x.lo);
+    ret.hi = __precise_fp32_sqrt_2f32(x.hi);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float8
+__precise_fp32_sqrt_8f32(float8 x)
+{
+    float8 ret;
+    ret.lo = __precise_fp32_sqrt_4f32(x.lo);
+    ret.hi = __precise_fp32_sqrt_4f32(x.hi);
+    return ret;
+}
+
+__attribute__((always_inline, weak)) float16
+__precise_fp32_sqrt_16f32(float16 x)
+{
+    float16 ret;
+    ret.s0123 = __precise_fp32_sqrt_4f32(x.s0123);
+    ret.s4567 = __precise_fp32_sqrt_4f32(x.s4567);
+    ret.s89ab = __precise_fp32_sqrt_4f32(x.s89ab);
+    ret.scdef = __precise_fp32_sqrt_4f32(x.scdef);
+    return ret;
+}
diff --git a/amd-builtins/math32/remainderF.cl b/amd-builtins/math32/remainderF.cl
new file mode 100644
index 0000000..5339704
--- /dev/null
+++ b/amd-builtins/math32/remainderF.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_REMAINDER
+#include "remainderF.h"
+
diff --git a/amd-builtins/math32/remainderF.h b/amd-builtins/math32/remainderF.h
new file mode 100644
index 0000000..088a10f
--- /dev/null
+++ b/amd-builtins/math32/remainderF.h
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+#if !defined(SUBNORMALS_SUPPORTED)
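+// Scale y by 2^t over the full float range, clamping t to +-1024 and
+// producing subnormal or infinite results as needed.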
+static inline float
+scaleFullRangef32(float y, float t)
+{
+    float ay, ty, r = 0;
+    int k, iiy, iy, exp_iy0, exp_iy, manty, signy, miy;
+    int delta, shift, ir;
+
+    ay = fabs(t);
+    k = ay > 1024 ? 1024 : (int) ay;
+    k = t < 0 ? -k : k;
+    t = (float) k;
+
+    iiy = as_int(y);
+    iy = iiy & EXSIGNBIT_SP32;
+    signy = iiy & SIGNBIT_SP32;
+    ay = as_float(iy);
+
+    exp_iy0 = iy & EXPBITS_SP32;
+    manty = iy & MANTBITS_SP32;
+
+    //sub-normal
+    ty = exp_iy0 == 0 ? (float) manty : as_float(iy);
+    k = exp_iy0 == 0 ? k - 149 : k;
+    ay = ty;
+    iy = as_int(ay);
+    exp_iy0 = iy & EXPBITS_SP32;
+    exp_iy = (exp_iy0 >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    // add k to y's exponent
+    r = as_float(iy + (k << EXPSHIFTBITS_SP32));
+    r = (exp_iy + k) > 127 ? as_float(PINFBITPATT_SP32) : r;
+    // distance of the scaled exponent below the normal range
+    delta = -126 - (exp_iy + k);
+
+    // sub-normal
+    miy = iy & MANTBITS_SP32;
+    miy |= IMPBIT_SP32;
+    shift = delta > 23 ? 24 : delta;
+    shift = delta < 0 ? 0 : shift;
+    miy >>= shift;
+    r = delta > 0 ? as_float(miy) : r;
+    r = t > (float) (2 * EMAX_SP32) ? as_float(PINFBITPATT_SP32) : r;
+    ir = as_int(r);
+    r = ir <= PINFBITPATT_SP32 ? as_float(as_int(r) | signy) : r;
+    return r;
+}
+
+/* Scales the float x by 2.0**n.
+Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
+static inline float
+scaleFloat_2(float x, int n)
+{
+    float t1, t2;
+    int n1, n2;
+    n1 = n / 2;
+    n2 = n - n1;
+    /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
+    t1 = as_float((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+    t2 = as_float((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+    return (x * t1) * t2;
+}
+
+/* Scales the float x by 2.0**n.
+   Assumes EMIN <= n <= EMAX, though this condition is not checked. */
+static inline float
+scaleFloat_1(float x, int n)
+{
+    float t;
+    /* Construct the number t = 2.0**n */
+    t = as_float((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+    return x * t;
+}
+
+/* Computes the exact product of x and y, the result being the
+nearly double length number (z,zz) */
+static inline void
+mul12f(float x, float y, float *z, float *zz)
+{
+    float hx, tx, hy, ty;
+    // Split x into hx (head) and tx (tail). Do the same for y.
+    uint u;
+    u = as_uint(x);
+    u &= 0xfffff000;
+    hx = as_float(u);
+    tx = x - hx;
+    u = as_uint(y);
+    u &= 0xfffff000;
+    hy = as_float(u);
+    ty = y - hy;
+    *z = x * y;
+    *zz = (((hx * hy - *z) + hx * ty) + tx * hy) + tx * ty;
+}
+
+#endif //SUBNORMALS_SUPPORTED
+
+#if defined(COMPILING_FMOD)
+__attribute__((overloadable)) float
+fmod(float x, float y)
+#elif defined(COMPILING_REMQUO)
+__attribute__((overloadable)) float
+remquo(float x, float y, int *quo)
+#else
+__attribute__((overloadable)) float
+remainder(float x, float y)
+#endif
+{
+#if !defined(SUBNORMALS_SUPPORTED)
+
+    const int loop_scale = 12;
+    const float fscale = 1.0f / (float) (1 << loop_scale);
+
+    int ntimes;
+    float ret = 0;
+    int ui_x, ui_y, ui_ax, ui_ay, xexp, yexp, signx;
+    float af_x, af_y, af_ybase, fx, fxp, fxm, fy, w, scale, t, c, cc, v;
+    float yscale, scaled_w, saved_w, div, sdiv, ratio, sratio, sub_fx;
+    int iw_scaled, wexp, it, i, ifx, ex, ey, fxexp;
+    float xr, xr0, xr_base, yr;
+    uint q;
+
+    ui_x = as_int(x);
+    ui_y = as_int(y);
+    ui_ax = ui_x & EXSIGNBIT_SP32;
+    ui_ay = ui_y & EXSIGNBIT_SP32;
+
+    /* special case handle */
+#if defined(COMPILING_REMQUO)
+    *quo = 0;
+#endif
+    if (ui_ax > PINFBITPATT_SP32)
+	return x;
+    if (ui_ax == PINFBITPATT_SP32)
+	return as_float(QNANBITPATT_SP32);
+    if (ui_ay > PINFBITPATT_SP32)
+	return y;
+    if (ui_ay == PINFBITPATT_SP32)
+	return x;
+    if (ui_ay == 0 && ui_ax == 0)
+	return as_float(QNANBITPATT_SP32);
+    if (ui_ax == 0)
+	return x;
+    if (ui_ay == 0)
+	return as_float(QNANBITPATT_SP32);
+
+    signx = ui_x & SIGNBIT_SP32;
+#if defined(COMPILING_REMQUO)
+    int signy = ui_y & SIGNBIT_SP32;
+#endif
+    af_x = as_float(ui_ax);
+    af_ybase = af_y = as_float(ui_ay);
+    yexp = (int) ((ui_y & EXPBITS_SP32) >> EXPSHIFTBITS_SP32);
+
+    yscale = (float) ((yexp < 48 && ui_ay != 0) ? (48 - yexp) : 0);
+    if (yscale != 0) {
+	af_y = scaleFullRangef32(af_ybase, yscale);
+    }
+
+    ui_y = as_int(af_y);
+    yexp = (int) ((ui_y & EXPBITS_SP32) >> EXPSHIFTBITS_SP32);
+    xexp = (int) ((ui_x & EXPBITS_SP32) >> EXPSHIFTBITS_SP32);
+    fx = af_x;
+    fy = af_y;
+
+    /* Set ntimes to the number of times we need to do a
+       partial remainder. If the exponent of x is an exact multiple
+       of 24 larger than the exponent of y, and the mantissa of x is
+       less than the mantissa of y, ntimes will be one too large
+       but it doesn't matter - it just means that we'll go round
+       the loop below one extra time. */
+    ntimes = (xexp - yexp) / loop_scale;
+    ntimes = xexp <= yexp ? 0 : ntimes;
+
+    /* Set w = y * 2^(ntimes*loop_scale) */
+    w = scaleFloat_2(fy, ntimes*loop_scale);
+    w = ntimes == 0 ? fy : w;
+
+    /* Set scale = 2^(-loop_scale) */
+    scale = ntimes == 0 ? 1.0f : fscale;
+
+    // make sure recip does not overflow
+    wexp = (int) ((as_int(w) & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    saved_w = w;
+    scaled_w = scaleFloat_1(w, -14);
+    iw_scaled = wexp > 105 & wexp <= 127;
+    w = iw_scaled & ntimes > 0 ? scaled_w : w;
+
+    /* Each time round the loop we compute a partial remainder.
+       This is done by subtracting a large multiple of w
+       from x each time, where w is a scaled up version of y.
+       The subtraction can be performed exactly when performed
+       in double precision, and the result at each stage can
+       fit exactly in a single precision number. */
+    for (i = 0; i < ntimes; i++) {
+	/* Set fx = fx - w * t, where t is equal to trunc(dx/w). */
+	div = __amdil_improved_div_f32(fx, w);
+	sdiv = scaleFloat_1(div, -14);
+	div = iw_scaled ? sdiv : div;
+	t = floor(div);
+	w = saved_w;
+	iw_scaled = 0;
+
+	/* At this point, t may be one too large due to rounding of fx/w */
+
+	/* Compute w * t in quad precision */
+	mul12f(w, t, &c, &cc);
+
+	/* Subtract w * t from fx */
+	v = fx - c;
+	fx = v + (((fx - v) - c) - cc);
+
+	/* If t was one too large, fx will be negative. Add back one w */
+	/* It might be possible to speed up this loop by finding
+	   a way to compute correctly truncated t directly from fx and w.
+	   We would then avoid the need for this check on negative fx. */
+	fxp = fx + w;
+	fxm = fx - w;
+	fx = fx < 0.0f ? fxp : fx;
+	fx = fx >= w ? fxm : fx;
+
+	/* Scale w down by for the next iteration */
+	w *= scale;
+	saved_w = w;
+    }
+
+    /* One more time */
+    // iw = as_int(w);
+    ifx = as_int(fx);
+    fxexp = (int) ((ifx & EXPBITS_SP32) >> EXPSHIFTBITS_SP32);
+    // wexp = (int) ((iw & EXPBITS_SP32) >> EXPSHIFTBITS_SP32);
+    sub_fx = fx;
+    // make sure recip does not overflow
+    wexp = (int) ((as_int(w) & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    saved_w = w;
+    scaled_w = scaleFloat_1(w, -14);
+    iw_scaled = wexp > 105 & wexp <= 127;
+    w = iw_scaled ? scaled_w : w;
+    ratio = __amdil_improved_div_f32(fx, w);
+    sratio = scaleFloat_1(ratio, -14);
+    ratio = iw_scaled ? sratio : ratio;
+    t = floor(ratio);
+    it = (int) t;
+
+    w = saved_w;
+    mul12f(w, t, &c, &cc);
+
+    v = fx - c;
+    fx = v + (((fx - v) - c) - cc);
+
+    if (fx < 0.0f) {
+	fx += w;
+	it--;
+    }
+
+    if (fx >= w) {
+	fx -= w;
+	it++;
+    }
+
+    // sub-normal fax
+    fx = fxexp == 0 ? sub_fx : fx;
+
+#if !defined(COMPILING_FMOD)
+    float scaleback = 0;
+#endif
+
+    // in case fx == 0 and we've got a divisor
+    it = (yscale > 30) ? 0 : ((unsigned int) it << (int) yscale);
+
+    if (as_int(fx) != 0 && yscale != 0) {
+	xr = fx;
+	xr_base = fx;
+	yr = af_ybase;
+	q = 0;
+	ex = ilogb(fx);
+	ey = ilogb(af_ybase);
+
+	yr = (float) scaleFullRangef32(af_ybase, (float) -ey);
+	xr = (float) scaleFullRangef32(fx, (float) -ex);
+
+	for (i = ex - ey; i > 0; i--) {
+	    q <<= 1;
+	    xr0 = xr;
+	    xr = (xr0 >= yr) ? xr0 - yr : xr0;
+	    q = (xr0 >= yr) ? q + 1 : q;
+	    xr += xr;
+	}
+	q <<= 1;
+	xr0 = xr;
+	xr = (xr0 >= yr) ? xr0 - yr : xr0;
+	q = (xr0 >= yr) ? q + 1 : q;
+	xr = scaleFullRangef32(xr, (float) ey);
+
+	fx = (ex - ey >= 0) ? xr : xr_base;
+#if !defined(COMPILING_FMOD)
+	q = (ex - ey >= 0) ? q : 0;
+	it += q;
+
+	xexp = (int) ((as_int(fx) & EXPBITS_SP32) >> EXPSHIFTBITS_SP32);
+
+	w = af_ybase;
+	if (xexp < 24) {
+	    fx = scaleFullRangef32(fx, 48);
+	    w = scaleFullRangef32(af_ybase, 48);
+	    scaleback = -48;
+	}
+#endif
+    }
+#if !defined(COMPILING_FMOD)
+    /* At this point, dx lies in the range [0,dy) */
+    /* For the remainder function, we need to adjust dx
+       so that it lies in the range (-y/2, y/2] by carefully
+       subtracting w (== fy == y) if necessary. */
+    if (fx * 2.f > w || ((fx * 2.f == w) && (it & 1))) {
+	fx -= w;
+	it++;
+    }
+    if (scaleback != 0) {
+	fx = scaleFullRangef32(fx, scaleback);
+    }
+#endif
+
+    ret = (signx) ? as_float(as_int(fx) ^ SIGNBIT_SP32) : fx;
+#if defined(COMPILING_REMQUO)
+    it = (signx ^ signy) ? -it : it;
+    *quo = it;
+#endif
+
+    return ret;
+
+
+#else
+
+    x = FTZ(x);
+    y = FTZ(y);
+
+    int ux = as_int(x);
+    int ax = ux & EXSIGNBIT_SP32;
+    float xa = as_float(ax);
+    int sx = ux ^ ax;
+    int ex = ax >> EXPSHIFTBITS_SP32;
+
+    int uy = as_int(y);
+    int ay = uy & EXSIGNBIT_SP32;
+    float ya = as_float(ay);
+#if defined COMPILING_REMQUO
+    int sy = uy ^ ay;
+#endif
+    int ey = ay >> EXPSHIFTBITS_SP32;
+
+    float xr = as_float(0x3f800000 | (ax & 0x007fffff));
+    float yr = as_float(0x3f800000 | (ay & 0x007fffff));
+    int c;
+    int k = ex - ey;
+
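+    // Binary long division on the normalized mantissas: each BIT step
+    // compares, conditionally subtracts, doubles the partial remainder,
+    // and (for remainder/remquo) shifts the next quotient bit into q.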
+#if defined COMPILING_FMOD
+# define BIT c = xr >= yr; xr -= c ? yr : 0.0f; xr += xr
+#else
+    uint q = 0;
+# define BIT c = xr >= yr; q = (q << 1) | c; xr -= c ? yr : 0.0f; xr += xr
+#endif
+
+    while (k > 3) {
+	BIT;
+	BIT;
+	BIT;
+	BIT;
+	k -= 4;
+    }
+
+    while (k > 0) {
+	BIT;
+	--k;
+    }
+
+#if !defined COMPILING_FMOD
+    c = xr > yr;
+    q = (q << 1) | c;
+#else
+    c = xr >= yr;
+#endif
+    xr -= c ? yr : 0.0f;
+
+    int lt = ex < ey;
+
+#if !defined COMPILING_FMOD
+    q = lt ? 0 : q;
+#endif
+    xr = lt ? xa : xr;
+    yr = lt ? ya : yr;
+
+#if !defined COMPILING_FMOD
+    c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & (q & 0x1) == 0x1);
+    xr -= c ? yr : 0.0f;
+    q += c;
+#endif
+
+    float s = as_float(ey << EXPSHIFTBITS_SP32);
+    xr *= lt ? 1.0f : s;
+
+#if defined COMPILING_REMQUO
+    int qsgn = sx == sy ? 1 : -1;
+    int quot = (q & 0x7f) * qsgn;
+#endif
+
+    c = ax == ay;
+#if defined COMPILING_REMQUO
+    quot = c ? qsgn : quot;
+#endif
+    xr = c ? 0.0f : xr;
+
+    xr = as_float(sx ^ as_int(xr));
+
+    c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0;
+#if defined COMPILING_REMQUO
+    quot = c ? 0 : quot;
+#endif
+    xr = c ? as_float(QNANBITPATT_SP32) : xr;
+
+#if defined COMPILING_REMQUO
+    *quo = quot;
+#endif
+
+    return xr;
+
+#endif
+}
+
diff --git a/amd-builtins/math32/remainderF_piby2.h b/amd-builtins/math32/remainderF_piby2.h
new file mode 100644
index 0000000..881ec0d
--- /dev/null
+++ b/amd-builtins/math32/remainderF_piby2.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
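+// Compute the exact product a*b as a head/tail pair (*hi + *lo), using fma
+// when available and a Dekker-style 12-bit split otherwise.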
+static inline void
+fullMulS(float *hi, float *lo, float a, float b, float bh, float bt)
+{
+    if (HAVE_HW_FMA32()) {
+        float ph = a * b;
+        *hi = ph;
+        *lo = fma(a, b, -ph);
+    } else {
+        float ah = as_float(as_uint(a) & 0xfffff000U);
+        float at = a - ah;
+        float ph = a * b;
+        float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph))));
+        *hi = ph;
+        *lo = pt;
+    }
+}
+
+static inline float
+removePi2S(float *hi, float *lo, float x)
+{
+    // 72 bits of pi/2
+    const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f;
+    const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f;
+    const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f;
+
+    const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f;
+    const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f;
+    const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f;
+
+    const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f;
+    const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f;
+    const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f;
+
+    const float twobypi = 0x1.45f306p-1f;
+
+    float fnpi2 = trunc(mad(x, twobypi, 0.5f));
+
+    // subtract n * pi/2 from x
+    float rhead, rtail;
+    fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
+    float v = x - rhead;
+    float rem = v + (((x - v) - rhead) - rtail);
+
+    float rhead2, rtail2;
+    fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
+    v = rem - rhead2;
+    rem = v + (((rem - v) - rhead2) - rtail2);
+
+    float rhead3, rtail3;
+    fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
+    v = rem - rhead3;
+
+    *hi = v + ((rem - v) - rhead3);
+    *lo = -rtail3;
+    return fnpi2;
+}
+
+static inline int
+argReductionSmallS(float *r, float *rr, float x)
+{
+    float fnpi2 = removePi2S(r, rr, x);
+    return (int)fnpi2 & 0x3;
+}
+
+extern uint __amdil_umad_u32(uint, uint, uint);
+extern uint __amdil_bitalign_i32(uint, uint, uint);
+
+static inline uint
+bitalign(uint hi, uint lo, uint shift)
+{
+    if (HAVE_BITALIGN())
+        return __amdil_bitalign_i32(hi, lo, shift);
+    else
+        return (hi << (32 - shift)) | (lo >> shift);
+}
+
+
+#define FULL_MUL(A, B, HI, LO) \
+    LO = A * B; \
+    HI = mul_hi(A, B)
+
+#define FULL_MAD(A, B, C, HI, LO) \
+    LO = __amdil_umad_u32(A, B, C); \
+    HI = mul_hi(A, B); \
+    HI += LO < C
+
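+// Payne-Hanek style reduction for large |x|: multiply the mantissa by
+// 224 bits of 2/pi held in 32-bit limbs, then extract the quadrant bits
+// and the fraction.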
+static inline int
+argReductionLargeS(float *r, float *rr, float x)
+{
+    int xe = (int)(as_uint(x) >> 23) - 127;
+    uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
+
+    // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
+    const uint b6 = 0xA2F9836EU;
+    const uint b5 = 0x4E441529U;
+    const uint b4 = 0xFC2757D1U;
+    const uint b3 = 0xF534DDC0U;
+    const uint b2 = 0xDB629599U;
+    const uint b1 = 0x3C439041U;
+    const uint b0 = 0xFE5163ABU;
+
+    uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
+
+    FULL_MUL(xm, b0, c0, p0);
+    FULL_MAD(xm, b1, c0, c1, p1);
+    FULL_MAD(xm, b2, c1, c0, p2);
+    FULL_MAD(xm, b3, c0, c1, p3);
+    FULL_MAD(xm, b4, c1, c0, p4);
+    FULL_MAD(xm, b5, c0, c1, p5);
+    FULL_MAD(xm, b6, c1, p7, p6);
+
+    uint fbits = 224 + 23 - xe;
+
+    // shift amount to get 2 lsb of integer part at top 2 bits
+    //   min: 25 (xe=18) max: 134 (xe=127)
+    uint shift = 256U - 2 - fbits;
+
+    // Shift by up to 134/32 = 4 words
+    int c = shift > 31;
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    p3 = c ? p2 : p3;
+    p2 = c ? p1 : p2;
+    p1 = c ? p0 : p1;
+    shift -= (-c) & 32;
+
+    c = shift > 31;
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    p3 = c ? p2 : p3;
+    p2 = c ? p1 : p2;
+    shift -= (-c) & 32;
+
+    c = shift > 31;
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    p3 = c ? p2 : p3;
+    shift -= (-c) & 32;
+
+    c = shift > 31;
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    shift -= (-c) & 32;
+
+    // bitalign cannot handle a shift of 32
+    c = shift > 0;
+    shift = 32 - shift;
+    uint t7 = bitalign(p7, p6, shift);
+    uint t6 = bitalign(p6, p5, shift);
+    uint t5 = bitalign(p5, p4, shift);
+    p7 = c ? t7 : p7;
+    p6 = c ? t6 : p6;
+    p5 = c ? t5 : p5;
+
+    // Get 2 lsb of int part and msb of fraction
+    int i = p7 >> 29;
+
+    // Scoot up 2 more bits so only fraction remains
+    p7 = bitalign(p7, p6, 30);
+    p6 = bitalign(p6, p5, 30);
+    p5 = bitalign(p5, p4, 30);
+
+    // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
+    uint flip = i & 1 ? 0xffffffffU : 0U;
+    uint sign = i & 1 ? 0x80000000U : 0U;
+    p7 = p7 ^ flip;
+    p6 = p6 ^ flip;
+    p5 = p5 ^ flip;
+
+    // Find exponent and shift away leading zeroes and hidden bit
+    xe = clz(p7) + 1;
+    shift = 32 - xe;
+    p7 = bitalign(p7, p6, shift);
+    p6 = bitalign(p6, p5, shift);
+
+    // Most significant part of fraction
+    float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
+
+    // Shift out bits we captured on q1
+    p7 = bitalign(p7, p6, 32-23);
+
+    // Get 24 more bits of fraction in another float; there are no long strings of zeroes here
+    int xxe = clz(p7) + 1;
+    p7 = bitalign(p7, p6, 32-xxe);
+    float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
+
+    // At this point, the fraction q1 + q0 is correct to at least 48 bits
+    // Now we need to multiply the fraction by pi/2
+    // This loses us about 4 bits
+    // pi/2 = C90 FDA A22 168 C23 4C4
+
+    const float pio2h = (float)0xc90fda / 0x1.0p+23f;
+    const float pio2hh = (float)0xc90 / 0x1.0p+11f;
+    const float pio2ht = (float)0xfda / 0x1.0p+23f;
+    const float pio2t = (float)0xa22168 / 0x1.0p+47f;
+
+    float rh, rt;
+
+    if (HAVE_HW_FMA32()) {
+        rh = q1 * pio2h;
+        rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
+    } else {
+        float q1h = as_float(as_uint(q1) & 0xfffff000);
+        float q1t = q1 - q1h;
+        rh = q1 * pio2h;
+        rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh))));
+        rt = mad(q0, pio2h, mad(q1, pio2t, rt));
+    }
+
+    float t = rh + rt;
+    rt = rt - (t - rh);
+
+    *r = t;
+    *rr = rt;
+    return ((i >> 1) + (i & 1)) & 0x3;
+}
+
+static inline int
+argReductionS(float *r, float *rr, float x)
+{
+    if (x < 0x1.0p+23f)
+        return argReductionSmallS(r, rr, x);
+    else
+        return argReductionLargeS(r, rr, x);
+}
+
diff --git a/amd-builtins/math32/remquoF.cl b/amd-builtins/math32/remquoF.cl
new file mode 100644
index 0000000..4e8b94a
--- /dev/null
+++ b/amd-builtins/math32/remquoF.cl
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_REMQUO
+#include "remainderF.h"
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline)) float
+remquo(float x, float y, __global int *quo)
+{
+    int q;
+    float r = remquo(x, y, &q);
+    *quo = q;
+    return r;
+}
+
+__attribute__((overloadable, always_inline)) float
+remquo(float x, float y, __local int *quo)
+{
+    int q;
+    float r = remquo(x, y, &q);
+    *quo = q;
+    return r;
+}
+#endif
+
diff --git a/amd-builtins/math32/rintF.cl b/amd-builtins/math32/rintF.cl
new file mode 100644
index 0000000..caf2663
--- /dev/null
+++ b/amd-builtins/math32/rintF.cl
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+// Compile only the scalar definition for HSA
+
+__attribute__((overloadable, always_inline)) float
+rint(float x)
+{
+    return __amdil_round_nearest_f32(x);
+}
diff --git a/amd-builtins/math32/rootnF.cl b/amd-builtins/math32/rootnF.cl
new file mode 100644
index 0000000..2d13aa1
--- /dev/null
+++ b/amd-builtins/math32/rootnF.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_ROOTN
+#include "powF_base.h"
+
diff --git a/amd-builtins/math32/roundF.cl b/amd-builtins/math32/roundF.cl
new file mode 100644
index 0000000..548ba6a
--- /dev/null
+++ b/amd-builtins/math32/roundF.cl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+round(float x)
+{
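+    // o is +-1.0 with the sign of x; add it when the fractional part is
+    // at least 0.5 (round half away from zero).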
+    float t = trunc(x);
+    float d = fabs(x - t);
+    float o = as_float((as_int(x) & 0x80000000) | 0x3f800000);
+    return t + (d >= 0.5f ? o : 0.0f);
+}
+
diff --git a/amd-builtins/math32/rsqrtF.cl b/amd-builtins/math32/rsqrtF.cl
new file mode 100644
index 0000000..dc4ab37
--- /dev/null
+++ b/amd-builtins/math32/rsqrtF.cl
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+rsqrt(float x)
+{
+#if !defined(SUBNORMALS_SUPPORTED)
+    int i = as_int(x);
+    int ai = i & 0x7fffffff;
+    int d = (ai > 0) & (ai < 0x00800000);
+    // scale subnormal by 2^26 without multiplying to avoid input flush
+    float s = as_float(i | 0x0d800000) - 0x1.0p-100F;
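+    // e.g. i = 0x00000001 (2^-149): the OR gives 0x0d800001 = (1 + 2^-23)*2^-100,
+    // and subtracting 2^-100 leaves 2^-123 = 2^-149 * 2^26 exactly; rsqrt of the
+    // scaled value is then 2^13 times too large^-1, fixed up by the multiply below.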
+    x = d ? s : x;
+    x = native_rsqrt(x);
+    x *= d ? 0x1.0p+13F : 1.0F;
+    return x;
+#else //SUBNORMALS_SUPPORTED
+    return native_rsqrt(x);
+#endif
+}
diff --git a/amd-builtins/math32/sinF.cl b/amd-builtins/math32/sinF.cl
new file mode 100644
index 0000000..8d02067
--- /dev/null
+++ b/amd-builtins/math32/sinF.cl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "sincosF_piby4.h"
+
+__attribute__((overloadable, pure)) float
+sin(float x)
+{
+#if 1
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    float dx = as_float(ax);
+
+    float r0, r1;
+    int regn = argReductionS(&r0, &r1, dx);
+
+    float ss = sinf_piby4_new(r0, r1);
+    float cc = cosf_piby4_new(r0, r1);
+
+    float s = (regn & 1) != 0 ? cc : ss;
+    s = as_float(as_int(s) ^ ((regn > 1) << 31) ^ (ix ^ ax));
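+    // (regn > 1) << 31 flips the sign in quadrants 2 and 3, and (ix ^ ax)
+    // restores the sign of the input, since sin is odd.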
+
+    s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s;
+
+    // Subnormals (flushed to zero) and +-0: sin(x) == x
+    s = x == 0.0f ? x : s;
+    return s;
+#else
+  // TODO_HSA: Using native_sin for now.
+  return native_sin(x);
+#endif
+}
+
diff --git a/amd-builtins/math32/sincosF.cl b/amd-builtins/math32/sincosF.cl
new file mode 100644
index 0000000..7c98df2
--- /dev/null
+++ b/amd-builtins/math32/sincosF.cl
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "sincosF_piby4.h"
+
+__attribute__ ((overloadable, always_inline)) float
+sincos(float x, float *result_cos)
+{
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    float dx = as_float(ax);
+
+    // Almost all args should be caught in the first branch
+    float r0, r1;
+    int regn = argReductionS(&r0, &r1, dx);
+
+    float ss = sinf_piby4_new(r0, r1);
+    float cc = cosf_piby4_new(r0, r1);
+
+    int flip = (regn > 1) << 31;
+    float s = (regn & 1) != 0 ? cc : ss;
+    s = as_float(as_int(s) ^ flip ^ (ax ^ ix));
+    ss = -ss;
+    float c = (regn & 1) != 0 ? ss : cc;
+    c = as_float(as_int(c) ^ flip);
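+    // s takes the input sign bit back (sin is odd) while c does not (cos is
+    // even); ss was negated beforehand because cos picks up -sin in the odd
+    // quadrants.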
+
+    c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c;
+    s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s;
+
+    *result_cos = c;
+    return s;
+}
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__ ((overloadable, always_inline)) float
+sincos(float x, __local float *result_cos)
+{
+    float c;
+    float s = sincos(x, &c);
+    *result_cos = c;
+    return s;
+}
+
+__attribute__ ((overloadable, always_inline)) float
+sincos(float x, __global float *result_cos)
+{
+    float c;
+    float s = sincos(x, &c);
+    *result_cos = c;
+    return s;
+}
+#endif
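+
+// Usage sketch (illustrative only, not part of the library): rotating a
+// vector v by theta with a single sincos call, in a hypothetical kernel:
+//
+//   float c;
+//   float s = sincos(theta, &c);
+//   out[i] = (float2)(mad(v.x, c, -v.y * s), mad(v.x, s, v.y * c));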
diff --git a/amd-builtins/math32/sincosF_piby4.h b/amd-builtins/math32/sincosF_piby4.h
new file mode 100644
index 0000000..f490f0e
--- /dev/null
+++ b/amd-builtins/math32/sincosF_piby4.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+static inline float sinf_piby4_new(float x, float y)
+{
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+    // = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
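+    // x is the high part and y the low part of the two-float reduced
+    // argument; the tail is folded in to first order via
+    // sin(x+y) ~= sin(x) + y*cos(x).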
+
+    const float c1 = -0.1666666666e0f;
+    const float c2 = 0.8333331876e-2f;
+    const float c3 = -0.198400874e-3f;
+    const float c4 = 0.272500015e-5f;
+    const float c5 = -2.5050759689e-08f; // 0xb2d72f34
+    const float c6 = 1.5896910177e-10f;	 // 0x2f2ec9d3
+
+    float z = x * x;
+    float v = z * x;
+    float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2);
+    float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y));
+
+    return ret;
+}
+
+static inline float cosf_piby4_new(float x, float y)
+{
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    // = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+
+    const float c1 = 0.416666666e-1f;
+    const float c2 = -0.138888876e-2f;
+    const float c3 = 0.248006008e-4f;
+    const float c4 = -0.2730101334e-6f;
+    const float c5 = 2.0875723372e-09f;	 // 0x310f74f6
+    const float c6 = -1.1359647598e-11f; // 0xad47d74e
+
+    float z = x * x;
+    float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6,  c5), c4), c3), c2), c1);
+
+    // if |x| < 0.3
+    float qx = 0.0f;
+
+    int ix = as_int(x) & EXSIGNBIT_SP32;
+
+    //  0.78125 > |x| >= 0.3
+    float xby4 = as_float(ix - 0x01000000);
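+    // subtracting 0x01000000 lowers the exponent field by 2, i.e. xby4 = |x|/4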
+    qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
+
+    // x > 0.78125
+    qx = ix > 0x3f480000 ? 0.28125f : qx;
+
+    float hz = mad(z, 0.5f, -qx);
+    float a = 1.0f - qx;
+    float ret = a - (hz - mad(z, r, -x*y));
+    return ret;
+}
+
diff --git a/amd-builtins/math32/sincospiF_piby4.h b/amd-builtins/math32/sincospiF_piby4.h
new file mode 100644
index 0000000..0bf4f75
--- /dev/null
+++ b/amd-builtins/math32/sincospiF_piby4.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Evaluate single precision sin and cos of a value in the interval [-pi/4, pi/4]
+static inline float2
+sincosf_piby4(float x)
+{
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+    // = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    // = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
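+    // Unlike sincosF_piby4.h there is no low-order tail term here; ret.x
+    // approximates sin(x) and ret.y approximates cos(x) on this interval.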
+
+    const float sc1 = -0.166666666638608441788607926e0F;
+    const float sc2 =  0.833333187633086262120839299e-2F;
+    const float sc3 = -0.198400874359527693921333720e-3F;
+    const float sc4 =  0.272500015145584081596826911e-5F;
+
+    const float cc1 =  0.41666666664325175238031e-1F;
+    const float cc2 = -0.13888887673175665567647e-2F;
+    const float cc3 =  0.24800600878112441958053e-4F;
+    const float cc4 = -0.27301013343179832472841e-6F;
+
+    float x2 = x * x;
+
+    float2 ret;
+    ret.x = mad(x*x2, mad(x2, mad(x2, mad(x2, sc4, sc3), sc2), sc1), x);
+    ret.y = mad(x2*x2, mad(x2, mad(x2, mad(x2, cc4, cc3), cc2), cc1), mad(x2, -0.5f, 1.0f));
+    return ret;
+}
+
diff --git a/amd-builtins/math32/sinhF.cl b/amd-builtins/math32/sinhF.cl
new file mode 100644
index 0000000..dc4c15b
--- /dev/null
+++ b/amd-builtins/math32/sinhF.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+sinh(float x)
+{
+    // After dealing with special cases the computation is split into regions as follows.
+    // abs(x) >= max_sinh_arg:
+    // sinh(x) = sign(x)*Inf
+    // abs(x) >= small_threshold:
+    // sinh(x) = sign(x)*exp(abs(x))/2 computed using the splitexp and scaleDouble functions as for exp_amd().
+    // abs(x) < small_threshold:
+    // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
+    // sinh(x) is then sign(x)*z.
+
+    // Tabulated values of sinh(i) and cosh(i) for i = 0,...,36.
+    USE_TABLE(float2, p_tbl, SINHCOSH_TBL);
+
+    const float max_sinh_arg = 0x1.65a9fap+6f;
+    const float small_threshold = 0x1.0a2b24p+3f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ aux;
+    float y = as_float(aux);
+
+    // We find the integer part y0 of y and the increment dy = y - y0. We then compute
+    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are tabulated above.
+    int ind = (int) y;
+    ind = (uint)ind > 36U ? 0 : ind;
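+    // e.g. y = 3.7f: ind = 3, dy = 0.7f, and the addition formula
+    // sinh(3 + 0.7) = sinh(3)*cosh(0.7) + cosh(3)*sinh(0.7) recombines them.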
+
+    float dy = y - ind;
+    float dy2 = dy * dy;
+
+    float sdy = mad(dy2,
+                    mad(dy2,
+                        mad(dy2,
+                            mad(dy2,
+                                mad(dy2,
+                                    mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f),
+                                    0.250521176994133472333666e-7f),
+                                0.275573191913636406057211e-5f),
+                            0.198412698413242405162014e-3f),
+                         0.833333333333329931873097e-2f),
+                    0.166666666666666667013899e0f);
+    sdy = mad(sdy, dy*dy2, dy);
+
+    float cdy = mad(dy2,
+                    mad(dy2,
+                        mad(dy2,
+                            mad(dy2,
+                                mad(dy2,
+                                    mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f),
+                                    0.275573350756016588011357e-6f),
+                                0.248015872460622433115785e-4f),
+                            0.138888888889814854814536e-2f),
+                        0.416666666666660876512776e-1f),
+                    0.500000000000000005911074e0f);
+    cdy = mad(cdy, dy2, 1.0f);
+
+    float2 tv = p_tbl[ind];
+    float z = mad(tv.s1, sdy, tv.s0 * cdy);
+    z = as_float(xs | as_uint(z));
+
+    // When y is large enough that the negative exponential is negligible,
+    // sinh(y) is approximated by sign(x)*exp(y)/2; 0x1.62e500p-1f is ln(2)
+    // rounded up slightly, and the mad term compensates for that rounding.
+    float t = exp(y - 0x1.62e500p-1f);
+    float zsmall = mad(0x1.a0210ep-18f, t, t);
+    zsmall = as_float(xs | as_uint(zsmall));
+    z = y >= small_threshold ? zsmall : z;
+
+    // Corner cases
+    float zinf = as_float(PINFBITPATT_SP32 | xs);
+    z = y >= max_sinh_arg ? zinf : z;
+    z = ((aux > PINFBITPATT_SP32) | (aux < 0x38800000U)) ? x : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/sinhcoshF_table.h b/amd-builtins/math32/sinhcoshF_table.h
new file mode 100644
index 0000000..29189c8
--- /dev/null
+++ b/amd-builtins/math32/sinhcoshF_table.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+

+

+DECLARE_TABLE(float2, SINHCOSH_TBL, 37,

+    (float2)(0x0.000000p+0f, 0x1.000000p+0f),

+    (float2)(0x1.2cd9fcp+0f, 0x1.8b0756p+0f),

+    (float2)(0x1.d03cf6p+1f, 0x1.e18fa0p+1f),

+    (float2)(0x1.40926ep+3f, 0x1.422a4ap+3f),

+    (float2)(0x1.b4a380p+4f, 0x1.b4ee86p+4f),

+    (float2)(0x1.28d016p+6f, 0x1.28d6fcp+6f),

+    (float2)(0x1.936d22p+7f, 0x1.936e68p+7f),

+    (float2)(0x1.122876p+9f, 0x1.122894p+9f),

+    (float2)(0x1.749ea6p+10f, 0x1.749eaap+10f),

+    (float2)(0x1.fa7158p+11f, 0x1.fa7158p+11f),

+    (float2)(0x1.5829dcp+13f, 0x1.5829dep+13f),

+    (float2)(0x1.d3c448p+14f, 0x1.d3c448p+14f),

+    (float2)(0x1.3de166p+16f, 0x1.3de166p+16f),

+    (float2)(0x1.b00b5ap+17f, 0x1.b00b5ap+17f),

+    (float2)(0x1.259ac4p+19f, 0x1.259ac4p+19f),

+    (float2)(0x1.8f0ccap+20f, 0x1.8f0ccap+20f),

+    (float2)(0x1.0f2ebep+22f, 0x1.0f2ebep+22f),

+    (float2)(0x1.709348p+23f, 0x1.709348p+23f),

+    (float2)(0x1.f4f220p+24f, 0x1.f4f220p+24f),

+    (float2)(0x1.546d90p+26f, 0x1.546d90p+26f),

+    (float2)(0x1.ceb088p+27f, 0x1.ceb088p+27f),

+    (float2)(0x1.3a6e20p+29f, 0x1.3a6e20p+29f),

+    (float2)(0x1.ab5adcp+30f, 0x1.ab5adcp+30f),

+    (float2)(0x1.226af4p+32f, 0x1.226af4p+32f),

+    (float2)(0x1.8ab7fcp+33f, 0x1.8ab7fcp+33f),

+    (float2)(0x1.0c3d3ap+35f, 0x1.0c3d3ap+35f),

+    (float2)(0x1.6c9326p+36f, 0x1.6c9326p+36f),

+    (float2)(0x1.ef8230p+37f, 0x1.ef8230p+37f),

+    (float2)(0x1.50bba4p+39f, 0x1.50bba4p+39f),

+    (float2)(0x1.c9aae4p+40f, 0x1.c9aae4p+40f),

+    (float2)(0x1.370470p+42f, 0x1.370470p+42f),

+    (float2)(0x1.a6b766p+43f, 0x1.a6b766p+43f),

+    (float2)(0x1.1f43fcp+45f, 0x1.1f43fcp+45f),

+    (float2)(0x1.866f34p+46f, 0x1.866f34p+46f),

+    (float2)(0x1.0953e2p+48f, 0x1.0953e2p+48f),

+    (float2)(0x1.689e22p+49f, 0x1.689e22p+49f),

+    (float2)(0x1.ea215ap+50f, 0x1.ea215ap+50f),

+)

+

diff --git a/amd-builtins/math32/sinpiF.cl b/amd-builtins/math32/sinpiF.cl
new file mode 100644
index 0000000..759143d
--- /dev/null
+++ b/amd-builtins/math32/sinpiF.cl
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "sincospiF_piby4.h"
+#if !defined(SUBNORMALS_SUPPORTED)
+#include "floattointconversion.h"
+#endif // !SUBNORMALS_SUPPORTED
+
+
+__attribute__((overloadable)) float
+sinpi(float x)
+{
+    const float pi = 3.1415926535897932F;
+
+    int ix = as_int(x);
+    int xsgn = ix & 0x80000000;
+    ix ^= xsgn;
+    float ax = as_float(ix);
+    int iax = (int)ax;
+    float r = ax - iax;
+    int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0);
+
+    // Initialize with return for +-Inf and NaN
+    int ir = 0x7fc00000;
+
+    // 2^23 <= |x| < Inf, the result is always integer
+    ir = ix < 0x7f800000 ? xsgn : ir;
+
+    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+    // r < 1.0
+    float a = 1.0f - r;
+    int e = 0;
+
+    // r <= 0.75
+    int c = r <= 0.75f;
+    a = c ? r - 0.5f : a;
+    e = c ? 1 : e;
+
+    // r < 0.5
+    c = r < 0.5f;
+    a = c ? 0.5f - r : a;
+
+    // 0 < r <= 0.25
+    c = r <= 0.25f;
+    a = c ? r : a;
+    e = c ? 0 : e;
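+    // e.g. x = 2.3f: iax = 2, r = 0.3f, xodd = 0; the stages leave a = 0.2f
+    // and e = 1, giving cos(0.2f * pi) = sin(0.3f * pi) as required.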
+
+    float2 t = sincosf_piby4(a * pi);
+    int jr = xodd ^ as_int(e ? t.hi : t.lo);
+
+    ir = ix < 0x4b000000 ? jr : ir;
+
+#if !defined(SUBNORMALS_SUPPORTED)
+    // Subnormal |x| is flushed to zero here; recompute x*pi in double
+    if (ax <= 0.0f) {
+        double d = float_uint_to_double(as_uint(x));
+        ir = double_to_float_uint(d * pi);
+    }
+#endif // !SUBNORMALS_SUPPORTED
+
+    return as_float(ir);
+}
+
diff --git a/amd-builtins/math32/sqrtF.cl b/amd-builtins/math32/sqrtF.cl
new file mode 100644
index 0000000..db13fe0
--- /dev/null
+++ b/amd-builtins/math32/sqrtF.cl
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline, weak)) float
+sqrt(float x)
+{
+#if !defined(SUBNORMALS_SUPPORTED)
+    int i = as_int(x);
+    int ai = i & 0x7fffffff;
+    int d = (ai > 0) & (ai < 0x00800000);
+    // scale subnormal by 2^26 without multiplying to avoid input flush
+    float s = as_float(i | 0x0d800000) - 0x1.0p-100F;
+    x = d ? s : x;
+    x = MATH_SQRT(x);
+    x *= d ? 0x1.0p-13F : 1.0F;
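+    // sqrt halves the exponent, so the 2^26 input scaling becomes 2^13 in
+    // the result and is undone by the multiply above.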
+    return x;
+#else //SUBNORMALS_SUPPORTED
+    return native_sqrt(x);
+#endif
+}
diff --git a/amd-builtins/math32/tables32.cl b/amd-builtins/math32/tables32.cl
new file mode 100644
index 0000000..d453f5f
--- /dev/null
+++ b/amd-builtins/math32/tables32.cl
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+#include "expF_table.h"
+
+#include "logF_table.h"
+
+#include "sinhcoshF_table.h"
+
+#include "atan2F_table.h"
+
+#include "cbrtF_table.h"
+
diff --git a/amd-builtins/math32/tanF.cl b/amd-builtins/math32/tanF.cl
new file mode 100644
index 0000000..16fa44f
--- /dev/null
+++ b/amd-builtins/math32/tanF.cl
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+#include "remainderF_piby2.h"
+#include "tanF_piby4.h"
+
+__attribute__((overloadable)) float
+tan(float x)
+{
+    int ix = as_int(x);
+    int ax = ix & 0x7fffffff;
+    float dx = as_float(ax);
+
+    float r0, r1;
+    int regn = argReductionS(&r0, &r1, dx);
+
+    float t = tanf_piby4_new(r0 + r1, regn);
+    t = as_float(as_int(t) ^ (ix ^ ax));
+
+    t = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : t;
+    // Take care of subnormals (flushed to zero) and +-0: tan(x) == x
+    t = (x == 0.0f) ? x : t;
+    return t;
+}
+
diff --git a/amd-builtins/math32/tanF_piby4.h b/amd-builtins/math32/tanF_piby4.h
new file mode 100644
index 0000000..2d64b0f
--- /dev/null
+++ b/amd-builtins/math32/tanF_piby4.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+static inline float
+tanf_piby4_new(float x, int regn)
+{
+    // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
+    float r = x * x;
+
+    float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
+
+    float b = mad(r,
+              mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
+              1.15588821434688393452299f);
+
+    float t = mad(x*r, __amdil_improved_div_f32(a, b), x);
+    float tr = -MATH_RECIP(t);
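+    // tan(x + pi/2) = -1/tan(x), so odd reduction regions return tr.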
+
+    return regn & 1 ? tr : t;
+}
diff --git a/amd-builtins/math32/tanhF.cl b/amd-builtins/math32/tanhF.cl
new file mode 100644
index 0000000..42cde35
--- /dev/null
+++ b/amd-builtins/math32/tanhF.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable)) float
+tanh(float x)
+{
+    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+    // to the following three formulae:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // but computationally, some formulae are better on some ranges.
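+    // Here: |x| <= 1 uses a rational minimax approximation (two coefficient
+    // sets, split at 0.9); 1 < |x| <= large_threshold uses formula 2; beyond
+    // that tanh(x) is +-1 to single precision.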
+
+    const float large_threshold = 0x1.0a2b24p+3f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ aux;
+
+    float y = as_float(aux);
+    float y2 = y*y;
+
+    float a1 = mad(y2,
+                   mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
+                   -0.28192806108402678e0F);
+    float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F);
+
+    float a2 = mad(y2,
+                   mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
+                   -0.24069858695196524e0F);
+    float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F);
+
+    int c = y < 0.9f;
+    float a = c ? a1 : a2;
+    float b = c ? b1 : b2;
+    float zlo = mad(MATH_DIVIDE(a, b), y*y2, y);
+
+    float p = exp(2.0f * y) + 1.0f;
+    float zhi = 1.0F - MATH_DIVIDE(2.0F, p);
+
+    float z = y <= 1.0f ? zlo : zhi;
+    z = as_float(xs | as_uint(z));
+
+    // Edge cases
+    float sone = as_float(0x3f800000U | xs);
+    z = y > large_threshold ? sone : z;
+    z = ((aux < 0x39000000) | (aux > 0x7f800000)) ? x : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math32/tanpiF.cl b/amd-builtins/math32/tanpiF.cl
new file mode 100644
index 0000000..ff13a19
--- /dev/null
+++ b/amd-builtins/math32/tanpiF.cl
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+#if !defined(SUBNORMALS_SUPPORTED)
+#include "floattointconversion.h"
+#endif // !SUBNORMALS_SUPPORTED
+
+static inline float2
+tanf_piby4(float x)
+{
+    // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]
+    float r = x*x;
+    float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
+    float b = mad(r, mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
+                  1.15588821434688393452299f);
+    float t = mad(x*r, MATH_DIVIDE(a,b), x);
+    return (float2)(t, -MATH_RECIP(t));
+}
+
+__attribute__((overloadable)) float
+tanpi(float x)
+{
+    const float pi = 3.1415926535897932F;
+
+    int ix = as_int(x);
+    int xsgn = ix & 0x80000000;
+    int xnsgn = xsgn ^ 0x80000000;
+    ix ^= xsgn;
+    float ax = as_float(ix);
+    int iax = (int)ax;
+    float r = ax - iax;
+    int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0);
+
+    // Initialize with return for +-Inf and NaN
+    int ir = 0x7fc00000;
+
+    // 2^24 <= |x| < Inf, the result is always even integer
+    ir = ix < 0x7f800000 ? xsgn : ir;
+
+    // 2^23 <= |x| < 2^24, the result is always integer
+    ir = ix < 0x4b800000 ? xodd : ir;
+
+    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+    // r < 1.0
+    float a = 1.0f - r;
+    int e = 0;
+    int s = xnsgn;
+
+    // r <= 0.75
+    int c = r <= 0.75f;
+    a = c ? r - 0.5f : a;
+    e = c ? 1 : e;
+    s = c ? xsgn : s;
+
+    // r < 0.5
+    c = r < 0.5f;
+    a = c ? 0.5f - r : a;
+    s = c ? xnsgn : s;
+
+    // 0 < r <= 0.25
+    c = r <= 0.25f;
+    a = c ? r : a;
+    e = c ? 0 : e;
+    s = c ? xsgn : s;
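+    // e.g. r = 0.6f: the r <= 0.75 stage gives a = 0.1f, e = 1, s = xsgn,
+    // so the result is -1/tan(0.1f * pi) = tan(0.6f * pi).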
+
+    float2 t = tanf_piby4(a * pi);
+    int jr = s ^ as_int(e ? t.hi : t.lo);
+
+    jr = r == 0.5f ? xodd | 0x7f800000 : jr;
+
+    ir = ix < 0x4b000000 ? jr : ir;
+
+#if !defined(SUBNORMALS_SUPPORTED)
+    // Subnormal |x| is flushed to zero here; recompute x*pi in double
+    if (ax <= 0.0f) {
+        double d = float_uint_to_double(as_uint(x));
+        ir = double_to_float_uint(d * pi);
+    }
+#endif // !SUBNORMALS_SUPPORTED
+
+    return as_float(ir);
+}
+
diff --git a/amd-builtins/math32/tgammaF.cl b/amd-builtins/math32/tgammaF.cl
new file mode 100644
index 0000000..7c76f2c
--- /dev/null
+++ b/amd-builtins/math32/tgammaF.cl
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+tgamma(float x)
+{
+    const float pi = 3.1415926535897932384626433832795f;
+    float ax = fabs(x);
+    float lg = lgamma(ax);
+    float g = exp(lg);
+
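+    // For x < 0, apply the reflection formula gamma(x)*gamma(1-x) = pi/sin(pi*x)
+    // with gamma(1-x) = -x*gamma(-x), i.e. gamma(x) = pi/(ax * gamma(ax) * sinpi(x)).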
+    if (x < 0.0f)
+    {
+        float z = sinpi(x);
+        g = g * ax * z;
+        g = pi / g;
+        g = g == 0 ? as_float(PINFBITPATT_SP32) : g;
+        g = z == 0 ? as_float(QNANBITPATT_SP32) : g;
+    }
+
+    return g;
+}
+
diff --git a/amd-builtins/math32/truncF.cl b/amd-builtins/math32/truncF.cl
new file mode 100644
index 0000000..5559ed1
--- /dev/null
+++ b/amd-builtins/math32/truncF.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math32.h"
+
+__attribute__((overloadable, always_inline)) float
+trunc(float x)
+{
+    return __amdil_round_zero_f32(x);
+}
diff --git a/amd-builtins/math32/vexpandF.cl b/amd-builtins/math32/vexpandF.cl
new file mode 100644
index 0000000..9085000
--- /dev/null
+++ b/amd-builtins/math32/vexpandF.cl
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
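+// Vector expansions of the pointer-result builtins: each overload splits its
+// argument into .lo/.hi halves (.s01/.s2 for 3-vectors), recurses to the next
+// narrower overload, and reassembles the pieces in a private temporary before
+// the single store to *p. The #ifndef __clang__ guards presumably drop the
+// __global/__local overloads on clang builds, where those address-space
+// variants are provided elsewhere.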
+__attribute__((overloadable, always_inline, weak)) float16
+frexp(float16 x, int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+frexp(float16 x, __global int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+frexp(float16 x, __local int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float8
+frexp(float8 x, int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+frexp(float8 x, __global int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+frexp(float8 x, __local int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float4
+frexp(float4 x, int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+frexp(float4 x, __global int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+frexp(float4 x, __local int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float3
+frexp(float3 x, int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = frexp(x.s01, &j);
+    i.s01 = j;
+    r.s2 = frexp(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+frexp(float3 x, __global int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = frexp(x.s01, &j);
+    i.s01 = j;
+    r.s2 = frexp(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+frexp(float3 x, __local int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = frexp(x.s01, &j);
+    i.s01 = j;
+    r.s2 = frexp(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float2
+frexp(float2 x, int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+frexp(float2 x, __global int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+frexp(float2 x, __local int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float16
+lgamma_r(float16 x, int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+lgamma_r(float16 x, __global int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+lgamma_r(float16 x, __local int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float8
+lgamma_r(float8 x, int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+lgamma_r(float8 x, __global int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+lgamma_r(float8 x, __local int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float4
+lgamma_r(float4 x, int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+lgamma_r(float4 x, __global int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+lgamma_r(float4 x, __local int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float3
+lgamma_r(float3 x, int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = lgamma_r(x.s01, &j);
+    i.s01 = j;
+    r.s2 = lgamma_r(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+lgamma_r(float3 x, __global int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = lgamma_r(x.s01, &j);
+    i.s01 = j;
+    r.s2 = lgamma_r(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+lgamma_r(float3 x, __local int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = lgamma_r(x.s01, &j);
+    i.s01 = j;
+    r.s2 = lgamma_r(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float2
+lgamma_r(float2 x, int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+lgamma_r(float2 x, __global int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+lgamma_r(float2 x, __local int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float16
+remquo(float16 x, float16 y, int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+remquo(float16 x, float16 y, __global int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+remquo(float16 x, float16 y, __local int16 *p)
+{
+    float16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float8
+remquo(float8 x, float8 y, int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+remquo(float8 x, float8 y, __global int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+remquo(float8 x, float8 y, __local int8 *p)
+{
+    float8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float4
+remquo(float4 x, float4 y, int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+remquo(float4 x, float4 y, __global int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+remquo(float4 x, float4 y, __local int4 *p)
+{
+    float4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float3
+remquo(float3 x, float3 y, int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = remquo(x.s01, y.s01, &j);
+    i.s01 = j;
+    r.s2 = remquo(x.s2, y.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+remquo(float3 x, float3 y, __global int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = remquo(x.s01, y.s01, &j);
+    i.s01 = j;
+    r.s2 = remquo(x.s2, y.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+remquo(float3 x, float3 y, __local int3 *p)
+{
+    float3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = remquo(x.s01, y.s01, &j);
+    i.s01 = j;
+    r.s2 = remquo(x.s2, y.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float2
+remquo(float2 x, float2 y, int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+remquo(float2 x, float2 y, __global int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+remquo(float2 x, float2 y, __local int2 *p)
+{
+    float2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
diff --git a/amd-builtins/math32/xvexpandF.cl b/amd-builtins/math32/xvexpandF.cl
new file mode 100644
index 0000000..bffad1d
--- /dev/null
+++ b/amd-builtins/math32/xvexpandF.cl
@@ -0,0 +1,909 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// XXX this file can be removed after clp is implemented
+
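+// Same lo/hi splitting pattern as vexpandF.cl, here for the builtins whose
+// pointer argument has the same element type as the result (fract, modf).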
+__attribute__((overloadable, always_inline, weak)) float16
+fract(float16 x, float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+fract(float16 x, __global float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+fract(float16 x, __local float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float8
+fract(float8 x, float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+fract(float8 x, __global float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+fract(float8 x, __local float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float4
+fract(float4 x, float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+fract(float4 x, __global float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+fract(float4 x, __local float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float3
+fract(float3 x, float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = fract(x.s01, &a);
+    t.s01 = a;
+    r.s2 = fract(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
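+
+// Note: a 3-component vector has no clean .lo/.hi split (per the OpenCL
+// spec, .hi on a 3-vector behaves as if it were a 4-vector whose fourth
+// component is undefined), so the float3 wrappers peel off .s01 and .s2
+// instead.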
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+fract(float3 x, __global float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = fract(x.s01, &a);
+    t.s01 = a;
+    r.s2 = fract(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+fract(float3 x, __local float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = fract(x.s01, &a);
+    t.s01 = a;
+    r.s2 = fract(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float2
+fract(float2 x, float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+fract(float2 x, __global float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+fract(float2 x, __local float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float16
+modf(float16 x, float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+modf(float16 x, __global float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+modf(float16 x, __local float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float8
+modf(float8 x, float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+modf(float8 x, __global float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+modf(float8 x, __local float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float4
+modf(float4 x, float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+modf(float4 x, __global float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+modf(float4 x, __local float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float3
+modf(float3 x, float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = modf(x.s01, &a);
+    t.s01 = a;
+    r.s2 = modf(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+modf(float3 x, __global float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = modf(x.s01, &a);
+    t.s01 = a;
+    r.s2 = modf(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+modf(float3 x, __local float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = modf(x.s01, &a);
+    t.s01 = a;
+    r.s2 = modf(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float2
+modf(float2 x, float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+modf(float2 x, __global float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+modf(float2 x, __local float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float16
+sincos(float16 x, float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+sincos(float16 x, __global float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float16
+sincos(float16 x, __local float16 *p)
+{
+    float16 r;
+    float16 t;
+    float8 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float8
+sincos(float8 x, float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+sincos(float8 x, __global float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float8
+sincos(float8 x, __local float8 *p)
+{
+    float8 r;
+    float8 t;
+    float4 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float4
+sincos(float4 x, float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+sincos(float4 x, __global float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float4
+sincos(float4 x, __local float4 *p)
+{
+    float4 r;
+    float4 t;
+    float2 a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float3
+sincos(float3 x, float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = sincos(x.s01, &a);
+    t.s01 = a;
+    r.s2 = sincos(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+sincos(float3 x, __global float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = sincos(x.s01, &a);
+    t.s01 = a;
+    r.s2 = sincos(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float3
+sincos(float3 x, __local float3 *p)
+{
+    float3 r;
+    float3 t;
+    float2 a;
+    float b;
+
+    r.s01 = sincos(x.s01, &a);
+    t.s01 = a;
+    r.s2 = sincos(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) float2
+sincos(float2 x, float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+sincos(float2 x, __global float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) float2
+sincos(float2 x, __local float2 *p)
+{
+    float2 r;
+    float2 t;
+    float a;
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
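+
+// Usage sketch (hypothetical kernel, not part of this library): the vector
+// overloads above let a caller obtain the sine and cosine of a whole float4
+// with one call through a private-memory pointer:
+#if 0
+__kernel void sincos4(__global const float4 *angles,
+                      __global float4 *sines,
+                      __global float4 *cosines)
+{
+    size_t i = get_global_id(0);
+    float4 c;
+    sines[i] = sincos(angles[i], &c);
+    cosines[i] = c;
+}
+#endif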
+
diff --git a/amd-builtins/math64/acosD.cl b/amd-builtins/math64/acosD.cl
new file mode 100644
index 0000000..c652ba8
--- /dev/null
+++ b/amd-builtins/math64/acosD.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+acos(double x)
+{
+    // Computes arccos(x).
+    // The argument is first reduced by noting that arccos(x)
+    // is invalid for abs(x) > 1. For denormal and small
+    // arguments arccos(x) = pi/2 to machine accuracy.
+    // Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arccos(x) = pi/2 - arcsin(x)
+    // = pi/2 - (x + x^3*R(x^2))
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arccos(|x|) = 2*arcsin(sqrt((1-|x|)/2))
+    // (for x < 0, arccos(x) = pi - arccos(|x|)),
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
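+    //
+    // In the transformed region the code keeps extra precision by splitting
+    // s = sqrt(r) into a 32-bit head s1 plus a correction c computed with
+    // fma, and by carrying pi/2 as a head/tail pair, before forming
+    // 2*(s1 + (s*u + c)) for positive x or pi - 2*(s + s*u) for negative x.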
+
+    const double pi = 3.1415926535897933e+00;             /* 0x400921fb54442d18 */
+    const double piby2 = 1.5707963267948965580e+00;       /* 0x3ff921fb54442d18 */
+    const double piby2_head = 1.5707963267948965580e+00;  /* 0x3ff921fb54442d18 */
+    const double piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
+
+    double y = fabs(x);
+    int xneg = as_int2(x).hi < 0;
+    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;
+
+    // abs(x) >= 0.5
+    int transform = xexp >= -1;
+
+    double rt = 0.5 * (1.0 - y);
+    double y2 = y * y;
+    double r = transform ? rt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    double un = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r,
+                                fma(r, 0.0000482901920344786991880522822991,
+                                       0.00109242697235074662306043804220),
+                                -0.0549989809235685841612020091328),
+                            0.275558175256937652532686256258),
+                        -0.445017216867635649900123110649),
+                    0.227485835556935010735943483075);
+
+    double ud = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r, 0.105869422087204370341222318533,
+                                   -0.943639137032492685763471240072),
+                            2.76568859157270989520376345954),
+                        -3.28431505720958658909889444194),
+                    1.36491501334161032038194214209);
+
+    double u = r * MATH_DIVIDE(un, ud);
+
+    // Reconstruct acos carefully in transformed region
+    double s = sqrt(r);
+    double ztn =  fma(-2.0, (s + fma(s, u, -piby2_tail)), pi);
+
+    double s1 = as_double(as_ulong(s) & 0xffffffff00000000UL);
+    double c = MATH_DIVIDE(fma(-s1, s1, r), s + s1);
+    double ztp = 2.0 * (s1 + fma(s, u, c));
+    double zt =  xneg ? ztn : ztp;
+    double z = piby2_head - (x - fma(-x, u, piby2_tail));
+
+    z =  transform ? zt : z;
+
+    z = xexp < -56 ? piby2 : z;
+    // A bare "xexp >= 0" test would be the natural way to flag out-of-range
+    // inputs, but it misclassifies NaN, so quiet the NaN explicitly instead.
+    z = isnan(x) ? as_double(as_ulong(x) | QNANBITPATT_DP64) : z;
+    z = x == 1.0 ? 0.0 : z;
+    z = x == -1.0 ? pi : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math64/acoshD.cl b/amd-builtins/math64/acoshD.cl
new file mode 100644
index 0000000..5c844d4
--- /dev/null
+++ b/amd-builtins/math64/acoshD.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "ep_logD.h"
+
+__attribute__((overloadable)) double
+acosh(double x)
+{
+    const double recrteps = 0x1.6a09e667f3bcdp+26;	// 1/sqrt(eps) = 9.49062656242515593767e+07
+    // log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const double log2_lead = 0x1.62e42ep-1;
+    const double log2_tail = 0x1.efa39ef35793cp-25;
+
+    // Result for x >= 128: acosh(x) = ln(x + sqrt(x*x - 1)).  Once
+    // x > 1/sqrt(eps), x + sqrt(x*x - 1) is 2x to machine precision, so
+    // take r = x and add one extra log(2) below (ln(2x) = ln(x) + ln(2)).
+    int xlarge = x > recrteps;
+    double r = x + sqrt(fma(x, x, -1.0));
+    r = xlarge ? x : r;
+
+    int xexp;
+    double r1, r2;
+    ep_log(r, &xexp, &r1, &r2);
+
+    double dxexp = xexp + xlarge;
+    r1 = fma(dxexp, log2_lead, r1);
+    r2 = fma(dxexp, log2_tail, r2);
+
+    double ret1 = r1 + r2;
+
+    // Handle 1 < x < 128 here
+    // We compute the value
+    // t = x - 1.0 + sqrt(2.0*(x - 1.0) + (x - 1.0)*(x - 1.0))
+    // using simulated quad precision.
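+    // Notation: a pair such as (v1,v2) denotes the unevaluated sum v1 + v2,
+    // with the second component holding the rounding error of the first;
+    // e.g. v2 = fma(t, t, -v1) recovers the low bits of t*t exactly.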
+    double t = x - 1.0;
+    double u1 = t * 2.0;
+
+    // (t,0) * (t,0) -> (v1, v2)
+    double v1 = t * t;
+    double v2 = fma(t, t, -v1);
+
+    // (u1,0) + (v1,v2) -> (w1,w2)
+    r = u1 + v1;
+    double s = (((u1 - r) + v1) + v2);
+    double w1 = r + s;
+    double w2 = (r - w1) + s;
+
+    // sqrt(w1,w2) -> (u1,u2)
+    double p1 = sqrt(w1);
+    double a1 = p1*p1;
+    double a2 = fma(p1, p1, -a1);
+    double temp = (((w1 - a1) - a2) + w2);
+    double p2 = MATH_DIVIDE(temp * 0.5, p1);
+    u1 = p1 + p2;
+    double u2 = (p1 - u1) + p2;
+
+    // (u1,u2) + (t,0) -> (r1,r2)
+    r = u1 + t;
+    s = ((u1 - r) + t) + u2;
+    // The full pair would be r1 = r + s, r2 = (r - r1) + s, with
+    // t = r1 + r2; in double precision that sum rounds back to r + s.
+    t = r + s;
+
+    // For arguments 1.13 <= x <= 1.5 the log1p function is good enough
+    double ret2 = log1p(t);
+
+    ulong ux = as_ulong(x);
+    double ret = x >= 128.0 ? ret1 : ret2;
+
+    ret = ux >= 0x7FF0000000000000 ? x : ret;
+    ret = x == 1.0 ? 0.0 : ret;
+    ret = ((ux & SIGNBIT_DP64) != 0UL || x < 1.0) ? as_double(QNANBITPATT_DP64) : ret;
+
+    return ret;
+}
diff --git a/amd-builtins/math64/acospiD.cl b/amd-builtins/math64/acospiD.cl
new file mode 100644
index 0000000..e8fc7ea
--- /dev/null
+++ b/amd-builtins/math64/acospiD.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+acospi(double x)
+{
+    // Computes arccos(x).
+    // The argument is first reduced by noting that arccos(x)
+    // is invalid for abs(x) > 1. For denormal and small
+    // arguments arccos(x) = pi/2 to machine accuracy.
+    // Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arccos(x) = pi/2 - arcsin(x)
+    // = pi/2 - (x + x^3*R(x^2))
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arccos(|x|) = 2*arcsin(sqrt((1-|x|)/2))
+    // (for x < 0, arccos(x) = pi - arccos(|x|)),
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
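+    //
+    // acospi(x) is acos(x)/pi; instead of one division at the end, the
+    // reconstruction below folds the division by pi into each branch.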
+
+    const double pi = 0x1.921fb54442d18p+1;
+    const double piby2_tail = 6.12323399573676603587e-17;        /* 0x3c91a62633145c07 */
+
+    double y = fabs(x);
+    int xneg = as_int2(x).hi < 0;
+    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;
+
+    // abs(x) >= 0.5
+    int transform = xexp >= -1;
+
+    // Transform y into the range [0,0.5)
+    double r1 = 0.5 * (1.0 - y);
+    double s = sqrt(r1);
+    double r = y * y;
+    r = transform ? r1 : r;
+    y = transform ? s : y;
+
+    // Use a rational approximation for [0.0, 0.5]
+    double un = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r,
+                                fma(r, 0.0000482901920344786991880522822991,
+                                       0.00109242697235074662306043804220),
+                                -0.0549989809235685841612020091328),
+                            0.275558175256937652532686256258),
+                        -0.445017216867635649900123110649),
+                    0.227485835556935010735943483075);
+
+    double ud = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r, 0.105869422087204370341222318533, 
+                                   -0.943639137032492685763471240072),
+                            2.76568859157270989520376345954),
+                        -3.28431505720958658909889444194),
+                    1.36491501334161032038194214209);
+
+    double u = r * MATH_DIVIDE(un, ud);
+
+    // Reconstruct acos carefully in transformed region
+    double res1 = fma(-2.0, MATH_DIVIDE(s + fma(y, u, -piby2_tail), pi), 1.0);
+    double s1 = as_double(as_ulong(s) & 0xffffffff00000000UL);
+    double c = MATH_DIVIDE(fma(-s1, s1, r), s + s1);
+    double res2 = MATH_DIVIDE(fma(2.0, s1, fma(2.0, c, 2.0 * y * u)), pi);
+    res1 = xneg ? res1 : res2;
+    res2 = 0.5 - fma(x, u, x) / pi;
+    res1 = transform ? res1 : res2;
+
+    const double qnan = as_double(QNANBITPATT_DP64);
+    res2 = x == 1.0 ? 0.0 : qnan;
+    res2 = x == -1.0 ? 1.0 : res2;
+    res1 = xexp >= 0 ? res2 : res1;
+    res1 = xexp < -56 ? 0.5 : res1;
+
+    return res1;
+}
+
diff --git a/amd-builtins/math64/asinD.cl b/amd-builtins/math64/asinD.cl
new file mode 100644
index 0000000..cc81114
--- /dev/null
+++ b/amd-builtins/math64/asinD.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+asin(double x)
+{
+    // Computes arcsin(x).
+    // The argument is first reduced by noting that arcsin(x)
+    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+    // For denormal and small arguments arcsin(x) = x to machine
+    // accuracy. Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arcsin(x) = x + x^3*R(x^2)
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2))
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
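+    //
+    // The rational minimax coefficients used below for R are the same ones
+    // used by the acos, acospi and asinpi kernels in this directory.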
+
+    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */
+    const double hpiby2_head = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */
+    const double piby2 = 1.5707963267948965e+00;       /* 0x3ff921fb54442d18 */
+
+    double y = fabs(x);
+    int xneg = as_int2(x).hi < 0;
+    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;
+
+    // abs(x) >= 0.5
+    int transform = xexp >= -1;
+
+    double rt = 0.5 * (1.0 - y);
+    double y2 = y * y;
+    double r = transform ? rt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+
+    double un = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r,
+                                fma(r, 0.0000482901920344786991880522822991,
+                                       0.00109242697235074662306043804220),
+                                -0.0549989809235685841612020091328),
+                            0.275558175256937652532686256258),
+                        -0.445017216867635649900123110649),
+                    0.227485835556935010735943483075);
+
+    double ud = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r, 0.105869422087204370341222318533,
+                                   -0.943639137032492685763471240072),
+                            2.76568859157270989520376345954),
+                        -3.28431505720958658909889444194),
+                    1.36491501334161032038194214209);
+
+    double u = r * MATH_DIVIDE(un, ud);
+
+    // Reconstruct asin carefully in transformed region
+    double s = sqrt(r);
+    double sh = as_double(as_ulong(s) & 0xffffffff00000000UL);
+    double c = MATH_DIVIDE(fma(-sh, sh, r), s + sh);
+    double p = fma(2.0*s, u, -fma(-2.0, c, piby2_tail));
+    double q = fma(-2.0, sh, hpiby2_head);
+    double vt = hpiby2_head - (p - q);
+    double v = fma(y, u, y);
+    v = transform ? vt : v;
+
+    v = xexp < -28 ? y : v;
+    v = xexp >= 0 ? as_double(QNANBITPATT_DP64) : v;
+    v = y == 1.0 ? piby2 : v;
+
+    return xneg ? -v : v;
+}
+
diff --git a/amd-builtins/math64/asinhD.cl b/amd-builtins/math64/asinhD.cl
new file mode 100644
index 0000000..e856419
--- /dev/null
+++ b/amd-builtins/math64/asinhD.cl
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "ep_logD.h"
+
+#define NA0 -0.12845379283524906084997e0
+#define NA1 -0.21060688498409799700819e0
+#define NA2 -0.10188951822578188309186e0
+#define NA3 -0.13891765817243625541799e-1
+#define NA4 -0.10324604871728082428024e-3
+
+#define DA0  0.77072275701149440164511e0
+#define DA1  0.16104665505597338100747e1
+#define DA2  0.11296034614816689554875e1
+#define DA3  0.30079351943799465092429e0
+#define DA4  0.235224464765951442265117e-1
+
+#define NB0 -0.12186605129448852495563e0
+#define NB1 -0.19777978436593069928318e0
+#define NB2 -0.94379072395062374824320e-1
+#define NB3 -0.12620141363821680162036e-1
+#define NB4 -0.903396794842691998748349e-4
+
+#define DB0  0.73119630776696495279434e0
+#define DB1  0.15157170446881616648338e1
+#define DB2  0.10524909506981282725413e1
+#define DB3  0.27663713103600182193817e0
+#define DB4  0.21263492900663656707646e-1
+
+#define NC0 -0.81210026327726247622500e-1
+#define NC1 -0.12327355080668808750232e0
+#define NC2 -0.53704925162784720405664e-1
+#define NC3 -0.63106739048128554465450e-2
+#define NC4 -0.35326896180771371053534e-4
+
+#define DC0  0.48726015805581794231182e0
+#define DC1  0.95890837357081041150936e0
+#define DC2  0.62322223426940387752480e0
+#define DC3  0.15028684818508081155141e0
+#define DC4  0.10302171620320141529445e-1
+
+#define ND0 -0.4638179204422665073e-1
+#define ND1 -0.7162729496035415183e-1
+#define ND2 -0.3247795155696775148e-1
+#define ND3 -0.4225785421291932164e-2
+#define ND4 -0.3808984717603160127e-4
+#define ND5  0.8023464184964125826e-6
+
+#define DD0  0.2782907534642231184e0
+#define DD1  0.5549945896829343308e0
+#define DD2  0.3700732511330698879e0
+#define DD3  0.9395783438240780722e-1
+#define DD4  0.7200057974217143034e-2
+
+#define NE0 -0.121224194072430701e-4
+#define NE1 -0.273145455834305218e-3
+#define NE2 -0.152866982560895737e-2
+#define NE3 -0.292231744584913045e-2
+#define NE4 -0.174670900236060220e-2
+#define NE5 -0.891754209521081538e-12
+
+#define DE0  0.499426632161317606e-4
+#define DE1  0.139591210395547054e-2
+#define DE2  0.107665231109108629e-1
+#define DE3  0.325809818749873406e-1
+#define DE4  0.415222526655158363e-1
+#define DE5  0.186315628774716763e-1
+
+#define NF0  -0.195436610112717345e-4
+#define NF1  -0.233315515113382977e-3
+#define NF2  -0.645380957611087587e-3
+#define NF3  -0.478948863920281252e-3
+#define NF4  -0.805234112224091742e-12
+#define NF5   0.246428598194879283e-13
+
+#define DF0   0.822166621698664729e-4
+#define DF1   0.135346265620413852e-2
+#define DF2   0.602739242861830658e-2
+#define DF3   0.972227795510722956e-2
+#define DF4   0.510878800983771167e-2
+
+#define NG0  -0.209689451648100728e-6
+#define NG1  -0.219252358028695992e-5
+#define NG2  -0.551641756327550939e-5
+#define NG3  -0.382300259826830258e-5
+#define NG4  -0.421182121910667329e-17
+#define NG5   0.492236019998237684e-19
+
+#define DG0   0.889178444424237735e-6
+#define DG1   0.131152171690011152e-4
+#define DG2   0.537955850185616847e-4
+#define DG3   0.814966175170941864e-4
+#define DG4   0.407786943832260752e-4
+
+#define NH0  -0.178284193496441400e-6
+#define NH1  -0.928734186616614974e-6
+#define NH2  -0.923318925566302615e-6
+#define NH3  -0.776417026702577552e-19
+#define NH4   0.290845644810826014e-21
+
+#define DH0   0.786694697277890964e-6
+#define DH1   0.685435665630965488e-5
+#define DH2   0.153780175436788329e-4
+#define DH3   0.984873520613417917e-5
+
+#define NI0  -0.538003743384069117e-10
+#define NI1  -0.273698654196756169e-9
+#define NI2  -0.268129826956403568e-9
+#define NI3  -0.804163374628432850e-29
+
+#define DI0   0.238083376363471960e-9
+#define DI1   0.203579344621125934e-8
+#define DI2   0.450836980450693209e-8
+#define DI3   0.286005148753497156e-8
+
+__attribute__((overloadable)) double
+asinh(double x)
+{
+    const double rteps = 0x1.6a09e667f3bcdp-27;
+    const double recrteps = 0x1.6a09e667f3bcdp+26;
+
+    // log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const double log2_lead = 0x1.62e42ep-1;
+    const double log2_tail = 0x1.efa39ef35793cp-25;
+
+    ulong ux = as_ulong(x);
+    ulong ax = ux & ~SIGNBIT_DP64;
+    double absx = as_double(ax);
+
+    double t = x * x;
+    double pn, tn, pd, td;
+
+    // XXX we are betting here that we can evaluate 9 pairs of
+    // polynomials faster than we can grab 12 coefficients from a table.
+    // This also uses fewer registers.
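+    // Each "absx < T ? tn : pn" step below is a branchless select: all nine
+    // numerator/denominator pairs are evaluated and the pair matching the
+    // magnitude of x survives.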
+
+    // |x| >= 8
+    pn = fma(t, fma(t, fma(t, NI3, NI2), NI1), NI0);
+    pd = fma(t, fma(t, fma(t, DI3, DI2), DI1), DI0);
+
+    tn = fma(t, fma(t, fma(t, fma(t, NH4, NH3), NH2), NH1), NH0);
+    td = fma(t, fma(t, fma(t, DH3, DH2), DH1), DH0);
+    pn = absx < 8.0 ? tn : pn;
+    pd = absx < 8.0 ? td : pd;
+
+    tn = fma(t, fma(t, fma(t, fma(t, fma(t, NG5, NG4), NG3), NG2), NG1), NG0);
+    td = fma(t, fma(t, fma(t, fma(t, DG4, DG3), DG2), DG1), DG0);
+    pn = absx < 4.0 ? tn : pn;
+    pd = absx < 4.0 ? td : pd;
+
+    tn = fma(t, fma(t, fma(t, fma(t, fma(t, NF5, NF4), NF3), NF2), NF1), NF0);
+    td = fma(t, fma(t, fma(t, fma(t, DF4, DF3), DF2), DF1), DF0);
+    pn = absx < 2.0 ? tn : pn;
+    pd = absx < 2.0 ? td : pd;
+
+    tn = fma(t, fma(t, fma(t, fma(t, fma(t, NE5, NE4), NE3), NE2), NE1), NE0);
+    td = fma(t, fma(t, fma(t, fma(t, fma(t, DE5, DE4), DE3), DE2), DE1), DE0);
+    pn = absx < 1.5 ? tn : pn;
+    pd = absx < 1.5 ? td : pd;
+
+    tn = fma(t, fma(t, fma(t, fma(t, fma(t, ND5, ND4), ND3), ND2), ND1), ND0);
+    td = fma(t, fma(t, fma(t, fma(t, DD4, DD3), DD2), DD1), DD0);
+    pn = absx <= 1.0 ? tn : pn;
+    pd = absx <= 1.0 ? td : pd;
+
+    tn = fma(t, fma(t, fma(t, fma(t, NC4, NC3), NC2), NC1), NC0);
+    td = fma(t, fma(t, fma(t, fma(t, DC4, DC3), DC2), DC1), DC0);
+    pn = absx < 0.75 ? tn : pn;
+    pd = absx < 0.75 ? td : pd;
+
+    tn = fma(t, fma(t, fma(t, fma(t, NB4, NB3), NB2), NB1), NB0);
+    td = fma(t, fma(t, fma(t, fma(t, DB4, DB3), DB2), DB1), DB0);
+    pn = absx < 0.5 ? tn : pn;
+    pd = absx < 0.5 ? td : pd;
+
+    tn = fma(t, fma(t, fma(t, fma(t, NA4, NA3), NA2), NA1), NA0);
+    td = fma(t, fma(t, fma(t, fma(t, DA4, DA3), DA2), DA1), DA0);
+    pn = absx < 0.25 ? tn : pn;
+    pd = absx < 0.25 ? td : pd;
+
+    double pq = MATH_DIVIDE(pn, pd);
+
+    // |x| <= 1
+    double result1 = fma(absx*t, pq, absx);
+
+    // Other ranges
+    int xout = absx <= 32.0 | absx > recrteps;
+    double y = absx + sqrt(fma(absx, absx, 1.0));
+    y = xout ? absx : y;
+
+    double r1, r2;
+    int xexp;
+    ep_log(y, &xexp, &r1, &r2);
+
+    double dxexp = (double)(xexp + xout);
+    r1 = fma(dxexp, log2_lead, r1);
+    r2 = fma(dxexp, log2_tail, r2);
+
+    // 1 < x <= 32
+    double v2 = (pq + 0.25) / t;
+    double r = v2 + r1;
+    double s = ((r1 - r) + v2) + r2;
+    double v1 = r + s;
+    v2 = (r - v1) + s;
+    double result2 = v1 + v2;
+
+    // x > 32
+    double result3 = r1 + r2;
+
+    double ret = absx > 1.0 ? result2 : result1;
+    ret = absx > 32.0 ? result3 : ret;
+    ret = x < 0.0 ? -ret : ret;
+
+    // NaN, +-Inf, or x small enough that asinh(x) = x
+    ret = ax >= PINFBITPATT_DP64 | absx < rteps ? x : ret;
+    return ret;
+}
+
diff --git a/amd-builtins/math64/asinpiD.cl b/amd-builtins/math64/asinpiD.cl
new file mode 100644
index 0000000..70bf22c
--- /dev/null
+++ b/amd-builtins/math64/asinpiD.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+asinpi(double x)
+{
+    // Computes arcsin(x).
+    // The argument is first reduced by noting that arcsin(x)
+    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+    // For denormal and small arguments arcsin(x) = x to machine
+    // accuracy. Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arcsin(x) = x + x^3*R(x^2)
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2))
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
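+    //
+    // asinpi(x) is asin(x)/pi; here the division by pi is applied once,
+    // after the asin reconstruction.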
+
+    const double pi = 0x1.921fb54442d18p+1;
+    const double piby2_tail = 6.1232339957367660e-17;	/* 0x3c91a62633145c07 */
+    const double hpiby2_head = 7.8539816339744831e-01;	/* 0x3fe921fb54442d18 */
+
+    double y = fabs(x);
+    int xneg = as_int2(x).hi < 0;
+    int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;
+
+    // abs(x) >= 0.5
+    int transform = xexp >= -1;
+
+    double rt = 0.5 * (1.0 - y);
+    double y2 = y * y;
+    double r = transform ? rt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    double un = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r,
+                                fma(r, 0.0000482901920344786991880522822991,
+                                       0.00109242697235074662306043804220),
+                                -0.0549989809235685841612020091328),
+                            0.275558175256937652532686256258),
+                        -0.445017216867635649900123110649),
+                    0.227485835556935010735943483075);
+
+    double ud = fma(r,
+                    fma(r,
+                        fma(r,
+                            fma(r, 0.105869422087204370341222318533,
+                                   -0.943639137032492685763471240072),
+                            2.76568859157270989520376345954),
+                        -3.28431505720958658909889444194),
+                    1.36491501334161032038194214209);
+
+    double u = r * MATH_DIVIDE(un, ud);
+
+    // Reconstruct asin carefully in transformed region
+    double s = sqrt(r);
+    double sh = as_double(as_ulong(s) & 0xffffffff00000000UL);
+    double c = MATH_DIVIDE(fma(-sh, sh, r), s + sh);
+    double p = fma(2.0*s, u, -fma(-2.0, c, piby2_tail));
+    double q = fma(-2.0, sh, hpiby2_head);
+    double vt = hpiby2_head - (p - q);
+    double v = fma(y, u, y);
+    v = transform ? vt : v;
+
+    v = xexp < -28 ? y : v;
+    v = MATH_DIVIDE(v, pi);
+    v = xexp >= 0 ? as_double(QNANBITPATT_DP64) : v;
+    v = y == 1.0 ? 0.5 : v;
+    return xneg ? -v : v;
+}
+
diff --git a/amd-builtins/math64/atan2D.cl b/amd-builtins/math64/atan2D.cl
new file mode 100644
index 0000000..5359780
--- /dev/null
+++ b/amd-builtins/math64/atan2D.cl
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline, weak)) double
+atan2(double y, double x)
+{
+    USE_TABLE(double2, atan_jby256_tbl, ATAN_JBY256_TBL);
+
+    const double pi = 3.1415926535897932e+00;          /* 0x400921fb54442d18 */
+    const double piby2 = 1.5707963267948966e+00;       /* 0x3ff921fb54442d18 */
+    const double piby4 = 7.8539816339744831e-01;       /* 0x3fe921fb54442d18 */
+    const double three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */
+    const double pi_head = 3.1415926218032836e+00;     /* 0x400921fb50000000 */
+    const double pi_tail = 3.1786509547056392e-08;     /* 0x3e6110b4611a6263 */
+    const double piby2_head = 1.5707963267948965e+00;  /* 0x3ff921fb54442d18 */
+    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */
+
+    double x2 = x;
+    int xneg = as_int2(x).hi < 0;
+    int xexp = (as_int2(x).hi >> 20) & 0x7ff;
+
+    double y2 = y;
+    int yneg = as_int2(y).hi < 0;
+    int yexp = (as_int2(y).hi >> 20) & 0x7ff;
+
+    int cond2 = (xexp < 1021) & (yexp < 1021);
+    int diffexp = yexp - xexp;
+
+    // Scale up both x and y if they are both below 1/4
+    double x1 = ldexp(x, 1024);
+    int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff;
+    double y1 = ldexp(y, 1024);
+    int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff;
+    int diffexp1 = yexp1 - xexp1;
+
+    diffexp = cond2 ? diffexp1 : diffexp;
+    x = cond2 ? x1 : x;
+    y = cond2 ? y1 : y;
+
+    // General case: take absolute values of arguments
+    double u = fabs(x);
+    double v = fabs(y);
+
+    // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+    int swap_vu = u < v;
+    double uu = u;
+    u = swap_vu ? v : u;
+    v = swap_vu ? uu : v;
+
+    double vbyu = v / u;
+    double q1, q2;
+
+    // General values of v/u. Use a look-up table and series expansion.
+
+    {
+        double val = vbyu > 0.0625 ? vbyu : 0.063;
+        int index = convert_int(fma(256.0, val, 0.5));
+        double2 tv = atan_jby256_tbl[index - 16];
+        q1 = tv.s0;
+        q2 = tv.s1;
+        double c = (double)index * 0x1.0p-8;
+
+        // We're going to scale u and v by 2^(-u_exponent) to bring them
+        // close to 1; u_exponent could be EMAX, which ldexp handles.
+        int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
+        // ldexp stands in for the original __amdil_ldexp_f64 intrinsic
+        double um = ldexp(u, m);
+        double vm = ldexp(v, m);
+
+        // 26 leading bits of u
+        double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL);
+        double u2 = um - u1;
+
+        double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um));
+
+        // Polynomial approximation to atan(r)
+        double s = r * r;
+        q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r);
+    }
+
+    double q3, q4;
+    {
+        q3 = 0.0;
+        q4 = vbyu;
+    }
+
+    double q5, q6;
+    {
+        double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL);
+        double u2 = u - u1;
+        double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL);
+        double vu2 = vbyu - vu1;
+
+        q5 = 0.0;
+        double s = vbyu * vbyu;
+        q6 = vbyu + fma(-vbyu * s,
+                        fma(-s,
+                            fma(-s,
+                                fma(-s,
+                                    fma(-s, 0.90029810285449784439E-01,
+                                        0.11110736283514525407),
+                                    0.14285713561807169030),
+                                0.19999999999393223405),
+                            0.33333333333333170500),
+                        MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u));
+    }
+
+    q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
+    q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+    q1 = vbyu > 0.0625 ? q1 : q3;
+    q2 = vbyu > 0.0625 ? q2 : q4;
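+
+    // Three candidate results were computed above: for vbyu below roughly
+    // 2^-26, atan(vbyu) is just vbyu (q3,q4); up to 0.0625 a short odd
+    // series suffices (q5,q6); otherwise the table value atan(c) plus a
+    // series in the small residual r is used (q1,q2).  The selects above
+    // keep the appropriate head/tail pair.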
+
+    // Tidy-up according to which quadrant the arguments lie in
+    double res1, res2, res3, res4;
+    q1 = swap_vu ? piby2_head - q1 : q1;
+    q2 = swap_vu ? piby2_tail - q2 : q2;
+    q1 = xneg ? pi_head - q1 : q1;
+    q2 = xneg ? pi_tail - q2 : q2;
+    q1 = q1 + q2;
+    res4 = yneg ? -q1 : q1;
+
+    res1 = yneg ? -three_piby4 : three_piby4;
+    res2 = yneg ? -piby4 : piby4;
+    res3 = xneg ? res1 : res2;
+
+    res3 = isinf(x2) & isinf(y2) ? res3 : res4;
+    res1 = yneg ? -pi : pi;
+
+    // abs(x)/abs(y) > 2^56 and x < 0
+    res3 = (diffexp < -56 && xneg) ? res1 : res3;
+
+    res4 = MATH_DIVIDE(y, x);
+    // x positive and dominant over y by a factor of 2^28
+    res3 = (diffexp < -28 && xneg == 0) ? res4 : res3;
+
+    // abs(y)/abs(x) > 2^56
+    res4 = yneg ? -piby2 : piby2;       // atan(y/x) is insignificant compared to piby2
+    res3 = diffexp > 56 ? res4 : res3;
+
+    res3 = x2 == 0.0 ? res4 : res3;   // Zero x gives +- pi/2 depending on sign of y
+    res4 = xneg ? res1 : y2;
+
+    res3 = y2 == 0.0 ? res4 : res3;   // Zero y gives +-0 for positive x and +-pi for negative x
+    res3 = isnan(y2) ? y2 : res3;
+    res3 = isnan(x2) ? x2 : res3;
+
+    return res3;
+}
+
diff --git a/amd-builtins/math64/atan2D_table.h b/amd-builtins/math64/atan2D_table.h
new file mode 100644
index 0000000..c37177c
--- /dev/null
+++ b/amd-builtins/math64/atan2D_table.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// The table ATAN_JBY256_TBL packs, as double2 values, the leading (.s0)
+// and trailing (.s1) parts of precomputed values of atan(j/256), for
+// j = 16, 17, ..., 256.  The leading part carries the first 21 bits of
+// precision, and the trailing part supplies a further 53 bits.
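+//
+// For example, the first entry corresponds to j = 16:
+// atan(16/256) = atan(0.0625) ~= 0x1.ff55b00000000p-5 + 0x1.6e59fbd38db2cp-26.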
+
+DECLARE_TABLE(double2, ATAN_JBY256_TBL, 241,
+    (double2)(0x1.ff55b00000000p-5, 0x1.6e59fbd38db2cp-26),
+    (double2)(0x1.0f99e00000000p-4, 0x1.4e3aa54dedf96p-25),
+    (double2)(0x1.1f86d00000000p-4, 0x1.7e105ab1bda88p-25),
+    (double2)(0x1.2f71900000000p-4, 0x1.8c5254d013fd0p-27),
+    (double2)(0x1.3f59f00000000p-4, 0x1.cf8ab3ad62670p-29),
+    (double2)(0x1.4f3fd00000000p-4, 0x1.9dca4bec80468p-26),
+    (double2)(0x1.5f23200000000p-4, 0x1.3f4b5ec98a8dap-26),
+    (double2)(0x1.6f03b00000000p-4, 0x1.b9d49619d81fep-25),
+    (double2)(0x1.7ee1800000000p-4, 0x1.3017887460934p-27),
+    (double2)(0x1.8ebc500000000p-4, 0x1.11e3eca0b9944p-26),
+    (double2)(0x1.9e94100000000p-4, 0x1.4f3f73c5a332ep-26),
+    (double2)(0x1.ae68a00000000p-4, 0x1.c71c8ae0e00a6p-26),
+    (double2)(0x1.be39e00000000p-4, 0x1.7cde0f86fbdc7p-25),
+    (double2)(0x1.ce07c00000000p-4, 0x1.70f328c889c72p-26),
+    (double2)(0x1.ddd2100000000p-4, 0x1.c07ae9b994efep-26),
+    (double2)(0x1.ed98c00000000p-4, 0x1.0c8021d7b1698p-27),
+    (double2)(0x1.fd5ba00000000p-4, 0x1.35585edb8cb22p-25),
+    (double2)(0x1.068d500000000p-3, 0x1.0842567b30e96p-24),
+    (double2)(0x1.0e6ad00000000p-3, 0x1.99e811031472ep-24),
+    (double2)(0x1.1646500000000p-3, 0x1.041821416bceep-25),
+    (double2)(0x1.1e1fa00000000p-3, 0x1.f6086e4dc96f4p-24),
+    (double2)(0x1.25f6e00000000p-3, 0x1.71a535c5f1b58p-27),
+    (double2)(0x1.2dcbd00000000p-3, 0x1.65f743fe63ca1p-24),
+    (double2)(0x1.359e800000000p-3, 0x1.dbd733472d014p-24),
+    (double2)(0x1.3d6ee00000000p-3, 0x1.d18cc4d8b0d1dp-24),
+    (double2)(0x1.453ce00000000p-3, 0x1.8c12553c8fb29p-24),
+    (double2)(0x1.4d08700000000p-3, 0x1.53b49e2e8f991p-24),
+    (double2)(0x1.54d1800000000p-3, 0x1.7422ae148c141p-24),
+    (double2)(0x1.5c98100000000p-3, 0x1.e3ec269df56a8p-27),
+    (double2)(0x1.645bf00000000p-3, 0x1.ff6754e7e0ac9p-24),
+    (double2)(0x1.6c1d400000000p-3, 0x1.131267b1b5aadp-24),
+    (double2)(0x1.73dbd00000000p-3, 0x1.d14fa403a94bcp-24),
+    (double2)(0x1.7b97b00000000p-3, 0x1.2f396c089a3d8p-25),
+    (double2)(0x1.8350b00000000p-3, 0x1.c731d78fa95bbp-24),
+    (double2)(0x1.8b06e00000000p-3, 0x1.c50f385177399p-24),
+    (double2)(0x1.92ba300000000p-3, 0x1.f41409c6f2c20p-25),
+    (double2)(0x1.9a6a800000000p-3, 0x1.d2d90c4c39ec0p-24),
+    (double2)(0x1.a217e00000000p-3, 0x1.80420696f2106p-25),
+    (double2)(0x1.a9c2300000000p-3, 0x1.b40327943a2e8p-27),
+    (double2)(0x1.b169600000000p-3, 0x1.5d35e02f3d2a2p-25),
+    (double2)(0x1.b90d700000000p-3, 0x1.4a498288117b0p-25),
+    (double2)(0x1.c0ae500000000p-3, 0x1.35da119afb324p-25),
+    (double2)(0x1.c84bf00000000p-3, 0x1.14e85cdb9a908p-24),
+    (double2)(0x1.cfe6500000000p-3, 0x1.38754e5547b9ap-25),
+    (double2)(0x1.d77d500000000p-3, 0x1.be40ae6ce3246p-24),
+    (double2)(0x1.df11000000000p-3, 0x1.0c993b3bea7e7p-24),
+    (double2)(0x1.e6a1400000000p-3, 0x1.1d2dd89ac3359p-24),
+    (double2)(0x1.ee2e100000000p-3, 0x1.1476603332c46p-25),
+    (double2)(0x1.f5b7500000000p-3, 0x1.f25901bac55b7p-24),
+    (double2)(0x1.fd3d100000000p-3, 0x1.f881b7c826e28p-24),
+    (double2)(0x1.025fa00000000p-2, 0x1.441996d698d20p-24),
+    (double2)(0x1.061ee00000000p-2, 0x1.407ac521ea089p-23),
+    (double2)(0x1.09dc500000000p-2, 0x1.2fb0c6c4b1723p-23),
+    (double2)(0x1.0d97e00000000p-2, 0x1.ca135966a3e18p-23),
+    (double2)(0x1.1151a00000000p-2, 0x1.b1218e4d646e4p-25),
+    (double2)(0x1.1509700000000p-2, 0x1.d4e72a350d288p-25),
+    (double2)(0x1.18bf500000000p-2, 0x1.4617e2f04c329p-23),
+    (double2)(0x1.1c73500000000p-2, 0x1.096ec41e82650p-25),
+    (double2)(0x1.2025500000000p-2, 0x1.9f91f25773e6ep-24),
+    (double2)(0x1.23d5600000000p-2, 0x1.59c0820f1d674p-25),
+    (double2)(0x1.2783700000000p-2, 0x1.02bf7a2df1064p-25),
+    (double2)(0x1.2b2f700000000p-2, 0x1.fb36bfc40508fp-23),
+    (double2)(0x1.2ed9800000000p-2, 0x1.ea08f3f8dc892p-24),
+    (double2)(0x1.3281800000000p-2, 0x1.3ed6254656a0ep-24),
+    (double2)(0x1.3627700000000p-2, 0x1.b83f5e5e69c58p-25),
+    (double2)(0x1.39cb400000000p-2, 0x1.d6ec2af768592p-23),
+    (double2)(0x1.3d6d100000000p-2, 0x1.493889a226f94p-25),
+    (double2)(0x1.410cb00000000p-2, 0x1.5ad8fa65279bap-23),
+    (double2)(0x1.44aa400000000p-2, 0x1.b615784d45434p-25),
+    (double2)(0x1.4845a00000000p-2, 0x1.09a184368f145p-23),
+    (double2)(0x1.4bdee00000000p-2, 0x1.61a2439b0d91cp-24),
+    (double2)(0x1.4f75f00000000p-2, 0x1.ce1a65e39a978p-24),
+    (double2)(0x1.530ad00000000p-2, 0x1.32a39a93b6a66p-23),
+    (double2)(0x1.569d800000000p-2, 0x1.1c3699af804e7p-23),
+    (double2)(0x1.5a2e000000000p-2, 0x1.75e0f4e44ede8p-26),
+    (double2)(0x1.5dbc300000000p-2, 0x1.f77ced1a7a83bp-23),
+    (double2)(0x1.6148400000000p-2, 0x1.84e7f0cb1b500p-29),
+    (double2)(0x1.64d1f00000000p-2, 0x1.ec6b838b02dfep-23),
+    (double2)(0x1.6859700000000p-2, 0x1.3ebf4dfbeda87p-23),
+    (double2)(0x1.6bdea00000000p-2, 0x1.9397aed9cb475p-23),
+    (double2)(0x1.6f61900000000p-2, 0x1.07937bc239c54p-24),
+    (double2)(0x1.72e2200000000p-2, 0x1.aa754553131b6p-23),
+    (double2)(0x1.7660700000000p-2, 0x1.4a05d407c45dcp-24),
+    (double2)(0x1.79dc600000000p-2, 0x1.132231a206dd0p-23),
+    (double2)(0x1.7d56000000000p-2, 0x1.2d8ecfdd69c88p-24),
+    (double2)(0x1.80cd400000000p-2, 0x1.a852c74218606p-24),
+    (double2)(0x1.8442200000000p-2, 0x1.71bf2baeebb50p-23),
+    (double2)(0x1.87b4b00000000p-2, 0x1.83d7db7491820p-27),
+    (double2)(0x1.8b24d00000000p-2, 0x1.ca50d92b6da14p-25),
+    (double2)(0x1.8e92900000000p-2, 0x1.6f5cde8530298p-26),
+    (double2)(0x1.91fde00000000p-2, 0x1.f343198910740p-24),
+    (double2)(0x1.9566d00000000p-2, 0x1.0e8d241ccd80ap-24),
+    (double2)(0x1.98cd500000000p-2, 0x1.1535ac619e6c8p-24),
+    (double2)(0x1.9c31600000000p-2, 0x1.7316041c36cd2p-24),
+    (double2)(0x1.9f93000000000p-2, 0x1.985a000637d8ep-24),
+    (double2)(0x1.a2f2300000000p-2, 0x1.f2f29858c0a68p-25),
+    (double2)(0x1.a64ee00000000p-2, 0x1.879847f96d909p-23),
+    (double2)(0x1.a9a9200000000p-2, 0x1.ab3d319e12e42p-23),
+    (double2)(0x1.ad00f00000000p-2, 0x1.5088162dfc4c2p-24),
+    (double2)(0x1.b056400000000p-2, 0x1.05749a1cd9d8cp-25),
+    (double2)(0x1.b3a9100000000p-2, 0x1.da65c6c6b8618p-26),
+    (double2)(0x1.b6f9600000000p-2, 0x1.739bf7df1ad64p-25),
+    (double2)(0x1.ba47300000000p-2, 0x1.bc31252aa3340p-25),
+    (double2)(0x1.bd92800000000p-2, 0x1.e528191ad3aa8p-26),
+    (double2)(0x1.c0db400000000p-2, 0x1.929d93df19f18p-23),
+    (double2)(0x1.c421900000000p-2, 0x1.ff11eb693a080p-26),
+    (double2)(0x1.c765500000000p-2, 0x1.55ae3f145a3a0p-27),
+    (double2)(0x1.caa6800000000p-2, 0x1.cbcd8c6c0ca82p-24),
+    (double2)(0x1.cde5300000000p-2, 0x1.0cb04d425d304p-24),
+    (double2)(0x1.d121500000000p-2, 0x1.9adfcab5be678p-24),
+    (double2)(0x1.d45ae00000000p-2, 0x1.93d90c5662508p-23),
+    (double2)(0x1.d791f00000000p-2, 0x1.68489bd35ff40p-24),
+    (double2)(0x1.dac6700000000p-2, 0x1.586ed3da2b7e0p-28),
+    (double2)(0x1.ddf8500000000p-2, 0x1.7604d2e850eeep-23),
+    (double2)(0x1.e127b00000000p-2, 0x1.ac1d12bfb53d8p-24),
+    (double2)(0x1.e454800000000p-2, 0x1.9b3d468274740p-28),
+    (double2)(0x1.e77eb00000000p-2, 0x1.fc5d68d10e53cp-24),
+    (double2)(0x1.eaa6500000000p-2, 0x1.8f9e51884becbp-23),
+    (double2)(0x1.edcb600000000p-2, 0x1.a87f0869c06d1p-23),
+    (double2)(0x1.f0ede00000000p-2, 0x1.31e7279f685fap-23),
+    (double2)(0x1.f40dd00000000p-2, 0x1.6a8282f9719b0p-27),
+    (double2)(0x1.f72b200000000p-2, 0x1.0d2724a8a44e0p-25),
+    (double2)(0x1.fa45d00000000p-2, 0x1.a60524b11ad4ep-23),
+    (double2)(0x1.fd5e000000000p-2, 0x1.75fdf832750f0p-26),
+    (double2)(0x1.0039c00000000p-1, 0x1.cf06902e4cd36p-23),
+    (double2)(0x1.01c3400000000p-1, 0x1.e82422d4f6d10p-25),
+    (double2)(0x1.034b700000000p-1, 0x1.24a091063e6c0p-26),
+    (double2)(0x1.04d2500000000p-1, 0x1.8a1a172dc6f38p-24),
+    (double2)(0x1.0657e00000000p-1, 0x1.29b6619f8a92dp-22),
+    (double2)(0x1.07dc300000000p-1, 0x1.9274d9c1b70c8p-24),
+    (double2)(0x1.095f300000000p-1, 0x1.0c34b1fbb7930p-26),
+    (double2)(0x1.0ae0e00000000p-1, 0x1.639866c20eb50p-25),
+    (double2)(0x1.0c61400000000p-1, 0x1.6d6d0f6832e9ep-23),
+    (double2)(0x1.0de0500000000p-1, 0x1.af54def99f25ep-22),
+    (double2)(0x1.0f5e200000000p-1, 0x1.16cfc52a00262p-22),
+    (double2)(0x1.10daa00000000p-1, 0x1.dcc1e83569c32p-23),
+    (double2)(0x1.1255d00000000p-1, 0x1.37f7a551ed425p-22),
+    (double2)(0x1.13cfb00000000p-1, 0x1.f6360adc98887p-22),
+    (double2)(0x1.1548500000000p-1, 0x1.2c6ec8d35a2c1p-22),
+    (double2)(0x1.16bfa00000000p-1, 0x1.bd44df84cb036p-23),
+    (double2)(0x1.1835a00000000p-1, 0x1.117cf826e310ep-22),
+    (double2)(0x1.19aa500000000p-1, 0x1.ca533f332cfc9p-22),
+    (double2)(0x1.1b1dc00000000p-1, 0x1.0f208509dbc2ep-22),
+    (double2)(0x1.1c8fe00000000p-1, 0x1.cd07d93c945dep-23),
+    (double2)(0x1.1e00b00000000p-1, 0x1.57bdfd67e6d72p-22),
+    (double2)(0x1.1f70400000000p-1, 0x1.aab89c516c658p-24),
+    (double2)(0x1.20de800000000p-1, 0x1.3e823b1a1b8a0p-25),
+    (double2)(0x1.224b700000000p-1, 0x1.307464a9d6d3cp-23),
+    (double2)(0x1.23b7100000000p-1, 0x1.c5993cd438843p-22),
+    (double2)(0x1.2521700000000p-1, 0x1.ba2fca02ab554p-22),
+    (double2)(0x1.268a900000000p-1, 0x1.01a5b6983a268p-23),
+    (double2)(0x1.27f2600000000p-1, 0x1.273d1b350efc8p-25),
+    (double2)(0x1.2958e00000000p-1, 0x1.64c238c37b0c6p-23),
+    (double2)(0x1.2abe200000000p-1, 0x1.aded07370a300p-25),
+    (double2)(0x1.2c22100000000p-1, 0x1.78091197eb47ep-23),
+    (double2)(0x1.2d84c00000000p-1, 0x1.4b0f245e0dabcp-24),
+    (double2)(0x1.2ee6200000000p-1, 0x1.080d9794e2eafp-22),
+    (double2)(0x1.3046400000000p-1, 0x1.d4ec242b60c76p-23),
+    (double2)(0x1.31a5200000000p-1, 0x1.221d2f940caa0p-27),
+    (double2)(0x1.3302b00000000p-1, 0x1.cdbc42b2bba5cp-24),
+    (double2)(0x1.345f000000000p-1, 0x1.cce37bb440840p-25),
+    (double2)(0x1.35ba000000000p-1, 0x1.6c1d999cf1dd0p-22),
+    (double2)(0x1.3713d00000000p-1, 0x1.bed8a07eb0870p-26),
+    (double2)(0x1.386c500000000p-1, 0x1.69ed88f490e3cp-24),
+    (double2)(0x1.39c3900000000p-1, 0x1.cd41719b73ef0p-25),
+    (double2)(0x1.3b19800000000p-1, 0x1.cbc4ac95b41b7p-22),
+    (double2)(0x1.3c6e400000000p-1, 0x1.238f1b890f5d7p-22),
+    (double2)(0x1.3dc1c00000000p-1, 0x1.50c4282259cc4p-24),
+    (double2)(0x1.3f13f00000000p-1, 0x1.713d2de87b3e2p-22),
+    (double2)(0x1.4064f00000000p-1, 0x1.1d5a7d2255276p-23),
+    (double2)(0x1.41b4a00000000p-1, 0x1.c0dfd48227ac1p-22),
+    (double2)(0x1.4303200000000p-1, 0x1.1c964dab76753p-22),
+    (double2)(0x1.4450600000000p-1, 0x1.6de56d5704496p-23),
+    (double2)(0x1.459c600000000p-1, 0x1.4aeb71fd19968p-23),
+    (double2)(0x1.46e7200000000p-1, 0x1.fbf91c57b1918p-23),
+    (double2)(0x1.4830a00000000p-1, 0x1.d6bef7fbe5d9ap-22),
+    (double2)(0x1.4978f00000000p-1, 0x1.464d3dc249066p-22),
+    (double2)(0x1.4ac0000000000p-1, 0x1.638e2ec4d9073p-22),
+    (double2)(0x1.4c05e00000000p-1, 0x1.16f4a7247ea7cp-24),
+    (double2)(0x1.4d4a800000000p-1, 0x1.1a0a740f1d440p-28),
+    (double2)(0x1.4e8de00000000p-1, 0x1.6edbb0114a33cp-23),
+    (double2)(0x1.4fd0100000000p-1, 0x1.dbee8bf1d513cp-24),
+    (double2)(0x1.5111000000000p-1, 0x1.5b8bdb0248f73p-22),
+    (double2)(0x1.5250c00000000p-1, 0x1.7de3d3f5eac64p-22),
+    (double2)(0x1.538f500000000p-1, 0x1.ee24187ae448ap-23),
+    (double2)(0x1.54cca00000000p-1, 0x1.e06c591ec5192p-22),
+    (double2)(0x1.5608d00000000p-1, 0x1.4e3861a332738p-24),
+    (double2)(0x1.5743c00000000p-1, 0x1.a9599dcc2bfe4p-24),
+    (double2)(0x1.587d800000000p-1, 0x1.f732fbad43468p-25),
+    (double2)(0x1.59b6000000000p-1, 0x1.eb9f573b727d9p-22),
+    (double2)(0x1.5aed600000000p-1, 0x1.8b212a2eb9897p-22),
+    (double2)(0x1.5c23900000000p-1, 0x1.384884c167215p-22),
+    (double2)(0x1.5d58900000000p-1, 0x1.0e2d363020051p-22),
+    (double2)(0x1.5e8c600000000p-1, 0x1.2820879fbd022p-22),
+    (double2)(0x1.5fbf000000000p-1, 0x1.a1ab9893e4b30p-22),
+    (double2)(0x1.60f0800000000p-1, 0x1.2d1b817a24478p-23),
+    (double2)(0x1.6220d00000000p-1, 0x1.15d7b8ded4878p-25),
+    (double2)(0x1.634ff00000000p-1, 0x1.8968f9db3a5e4p-24),
+    (double2)(0x1.647de00000000p-1, 0x1.71c4171fe135fp-22),
+    (double2)(0x1.65aab00000000p-1, 0x1.6d80f605d0d8cp-22),
+    (double2)(0x1.66d6600000000p-1, 0x1.c91f043691590p-24),
+    (double2)(0x1.6800e00000000p-1, 0x1.39f8a15fce2b2p-23),
+    (double2)(0x1.692a400000000p-1, 0x1.55beda9d94b80p-27),
+    (double2)(0x1.6a52700000000p-1, 0x1.b12c15d60949ap-23),
+    (double2)(0x1.6b79800000000p-1, 0x1.24167b312bfe3p-22),
+    (double2)(0x1.6c9f700000000p-1, 0x1.0ab8633070277p-22),
+    (double2)(0x1.6dc4400000000p-1, 0x1.54554ebbc80eep-23),
+    (double2)(0x1.6ee7f00000000p-1, 0x1.0204aef5a4bb8p-25),
+    (double2)(0x1.700a700000000p-1, 0x1.8af08c679cf2cp-22),
+    (double2)(0x1.712be00000000p-1, 0x1.0852a330ae6c8p-22),
+    (double2)(0x1.724c300000000p-1, 0x1.6d3eb9ec32916p-23),
+    (double2)(0x1.736b600000000p-1, 0x1.685cb7fcbbafep-23),
+    (double2)(0x1.7489700000000p-1, 0x1.1f751c1e0bd95p-22),
+    (double2)(0x1.75a6700000000p-1, 0x1.705b1b0f72560p-26),
+    (double2)(0x1.76c2400000000p-1, 0x1.b98d8d808ca92p-22),
+    (double2)(0x1.77dd100000000p-1, 0x1.2ea22c75cc980p-25),
+    (double2)(0x1.78f6b00000000p-1, 0x1.7aba62bca0350p-22),
+    (double2)(0x1.7a0f400000000p-1, 0x1.d73833442278cp-22),
+    (double2)(0x1.7b26c00000000p-1, 0x1.5a5ca1fb18bf9p-22),
+    (double2)(0x1.7c3d300000000p-1, 0x1.1a6092b6ecf28p-25),
+    (double2)(0x1.7d52800000000p-1, 0x1.44fd049aac104p-24),
+    (double2)(0x1.7e66c00000000p-1, 0x1.c114fd8df5180p-29),
+    (double2)(0x1.7f79e00000000p-1, 0x1.5972f130feae5p-22),
+    (double2)(0x1.808c000000000p-1, 0x1.ca034a55fe198p-24),
+    (double2)(0x1.819d000000000p-1, 0x1.6e2b149990227p-22),
+    (double2)(0x1.82ad000000000p-1, 0x1.b00000294592cp-24),
+    (double2)(0x1.83bbe00000000p-1, 0x1.8b9bdc442620ep-22),
+    (double2)(0x1.84c9c00000000p-1, 0x1.d94fdfabf3e4ep-23),
+    (double2)(0x1.85d6900000000p-1, 0x1.5db30b145ad9ap-23),
+    (double2)(0x1.86e2500000000p-1, 0x1.e3e1eb95022b0p-23),
+    (double2)(0x1.87ed000000000p-1, 0x1.d5b8b45442bd6p-22),
+    (double2)(0x1.88f6b00000000p-1, 0x1.7a046231ecd2ep-22),
+    (double2)(0x1.89ff500000000p-1, 0x1.feafe3ef55232p-22),
+    (double2)(0x1.8b06f00000000p-1, 0x1.839e7bfd78267p-22),
+    (double2)(0x1.8c0d900000000p-1, 0x1.45cf49d6fa900p-25),
+    (double2)(0x1.8d13200000000p-1, 0x1.be3132b27f380p-27),
+    (double2)(0x1.8e17a00000000p-1, 0x1.533980bb84f9fp-22),
+    (double2)(0x1.8f1b300000000p-1, 0x1.889e2ce3ba390p-26),
+    (double2)(0x1.901db00000000p-1, 0x1.f7778c3ad0cc8p-24),
+    (double2)(0x1.911f300000000p-1, 0x1.46660cec4eba2p-23),
+    (double2)(0x1.921fb00000000p-1, 0x1.5110b4611a626p-23),
+)
+
diff --git a/amd-builtins/math64/atan2piD.cl b/amd-builtins/math64/atan2piD.cl
new file mode 100644
index 0000000..9f3f026
--- /dev/null
+++ b/amd-builtins/math64/atan2piD.cl
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline, weak)) double
+atan2pi(double y, double x)
+{
+    USE_TABLE(double2, atan_jby256_tbl, ATAN_JBY256_TBL);
+
+    const double pi = 3.1415926535897932e+00;          /* 0x400921fb54442d18 */
+    const double pi_head = 3.1415926218032836e+00;     /* 0x400921fb50000000 */
+    const double pi_tail = 3.1786509547056392e-08;     /* 0x3e6110b4611a6263 */
+    const double piby2_head = 1.5707963267948965e+00;  /* 0x3ff921fb54442d18 */
+    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */
+
+    double x2 = x;
+    int xneg = as_int2(x).hi < 0;
+    int xexp = (as_int2(x).hi >> 20) & 0x7ff;
+
+    double y2 = y;
+    int yneg = as_int2(y).hi < 0;
+    int yexp = (as_int2(y).hi >> 20) & 0x7ff;
+
+    int cond2 = (xexp < 1021) & (yexp < 1021);
+    int diffexp = yexp - xexp;
+
+    // Scale up both x and y if they are both below 1/4
+    double x1 = ldexp(x, 1024);
+    int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff;
+    double y1 = ldexp(y, 1024);
+    int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff;
+    int diffexp1 = yexp1 - xexp1;
+
+    diffexp = cond2 ? diffexp1 : diffexp;
+    x = cond2 ? x1 : x;
+    y = cond2 ? y1 : y;
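+    // (Scaling by 2^1024 makes even subnormal inputs normal, so the
+    // recomputed exponent difference is meaningful for very small x and y.)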
+
+    // General case: take absolute values of arguments
+    double u = fabs(x);
+    double v = fabs(y);
+
+    // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+    int swap_vu = u < v;
+    double uu = u;
+    u = swap_vu ? v : u;
+    v = swap_vu ? uu : v;
+
+    double vbyu = v / u;
+    double q1, q2;
+
+    // General values of v/u. Use a look-up table and series expansion.
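+    // Sketch of the reduction (derivable from the code below): with
+    // c = index/256 close to vbyu,
+    //   atan(vbyu) = atan(c) + atan((vbyu - c)/(1 + c*vbyu)),
+    // where atan(c) = q1 + q2 comes from the table and the small
+    // remainder term is handled by a short odd polynomial in r.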
+
+    {
+        double val = vbyu > 0.0625 ? vbyu : 0.063;
+        int index = convert_int(fma(256.0, val, 0.5));
+        double2 tv = atan_jby256_tbl[index - 16];
+        q1 = tv.s0;
+        q2 = tv.s1;
+        double c = (double)index * 0x1.0p-8;
+
+        // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
+        // u_exponent could be EMAX so we have to do it in 2 steps
+        int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
+        double um = ldexp(u, m);
+        double vm = ldexp(v, m);
+
+        // 26 leading bits of u
+        double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL);
+        double u2 = um - u1;
+
+        double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um));
+
+        // Polynomial approximation to atan(r)
+        double s = r * r;
+        q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r);
+    }
+
+
+    double q3, q4;
+    {
+        q3 = 0.0;
+        q4 = vbyu;
+    }
+
+    double q5, q6;
+    {
+        double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL);
+        double u2 = u - u1;
+        double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL);
+        double vu2 = vbyu - vu1;
+
+        q5 = 0.0;
+        double s = vbyu * vbyu;
+        q6 = vbyu + fma(-vbyu * s,
+                        fma(-s,
+                            fma(-s,
+                                fma(-s,
+                                    fma(-s, 0.90029810285449784439E-01,
+                                        0.11110736283514525407),
+                                    0.14285713561807169030),
+                                0.19999999999393223405),
+                            0.33333333333333170500),
+                        MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u));
+    }
+
+
+    q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
+    q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+    q1 = vbyu > 0.0625 ? q1 : q3;
+    q2 = vbyu > 0.0625 ? q2 : q4;
+
+    // Tidy-up according to which quadrant the arguments lie in
+    double res1, res2, res3, res4;
+    q1 = swap_vu ? piby2_head - q1 : q1;
+    q2 = swap_vu ? piby2_tail - q2 : q2;
+    q1 = xneg ? pi_head - q1 : q1;
+    q2 = xneg ? pi_tail - q2 : q2;
+    q1 = MATH_DIVIDE(q1 + q2, pi);
+    res4 = yneg ? -q1 : q1;
+
+    res1 = yneg ? -0.75 : 0.75;
+    res2 = yneg ? -0.25 : 0.25;
+    res3 = xneg ? res1 : res2;
+
+    res3 = isinf(y2) & isinf(x2) ? res3 : res4;
+    res1 = yneg ? -1.0 : 1.0;
+
+    // abs(x)/abs(y) > 2^56 and x < 0
+    res3 = (diffexp < -56 && xneg) ? res1 : res3;
+
+    res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi);
+    // x positive and dominant over y by a factor of 2^28
+    res3 = diffexp < -28 & xneg == 0 ? res4 : res3;
+
+    // abs(y)/abs(x) > 2^56
+    res4 = yneg ? -0.5 : 0.5;        // atan(y/x) is insignificant compared to piby2
+    res3 = diffexp > 56 ? res4 : res3;
+
+    res3 = x2 == 0.0 ? res4 : res3;  // Zero x gives +-1/2 (+-pi/2 scaled by 1/pi) depending on sign of y
+    res4 = xneg ? res1 : y2;
+
+    res3 = y2 == 0.0 ? res4 : res3;  // Zero y gives +-0 for positive x and +-1 (+-pi/pi) for negative x
+    res3 = isnan(y2) ? y2 : res3;
+    res3 = isnan(x2) ? x2 : res3;
+
+    return res3;
+}
diff --git a/amd-builtins/math64/atanD.cl b/amd-builtins/math64/atanD.cl
new file mode 100644
index 0000000..0e803c7
--- /dev/null
+++ b/amd-builtins/math64/atanD.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+atan(double x)
+{
+    const double piby2 = 1.5707963267948966e+00; // 0x3ff921fb54442d18
+
+    double v = fabs(x);
+
+    // 2^56 > v > 39/16
+    double a = -1.0;
+    double b = v;
+    // (chi + clo) = arctan(infinity)
+    double chi = 1.57079632679489655800e+00;
+    double clo = 6.12323399573676480327e-17;
+
+    double ta = v - 1.5;
+    double tb = 1.0 + 1.5 * v;
+    int l = v <= 0x1.38p+1; // 39/16 > v > 19/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.5)
+    chi = l ? 9.82793723247329054082e-01 : chi;
+    clo = l ? 1.39033110312309953701e-17 : clo;
+
+    ta = v - 1.0;
+    tb = 1.0 + v;
+    l = v <= 0x1.3p+0; // 19/16 > v > 11/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.)
+    chi = l ? 7.85398163397448278999e-01 : chi;
+    clo = l ? 3.06161699786838240164e-17 : clo;
+
+    ta = 2.0 * v - 1.0;
+    tb = 2.0 + v;
+    l = v <= 0x1.6p-1; // 11/16 > v > 7/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(0.5)
+    chi = l ? 4.63647609000806093515e-01 : chi;
+    clo = l ? 2.26987774529616809294e-17 : clo;
+
+    l = v <= 0x1.cp-2; // v <= 7/16
+    a = l ? v : a;
+    b = l ? 1.0 : b;
+    chi = l ? 0.0 : chi;
+    clo = l ? 0.0 : clo;
+
+    // Core approximation: Remez(4,4) on [-7/16,7/16]
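+    // The branches above pick a pivot k in {0, 0.5, 1, 1.5, inf} with
+    // (chi + clo) = atan(k) split into head and tail parts and
+    // a/b = (v - k)/(1 + k*v), so that atan(v) = atan(k) + atan(a/b)
+    // and only the small term atan(a/b) needs the rational approximation.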
+    double r = a / b;
+    double s = r * r;
+    double qn = fma(s,
+                    fma(s,
+                        fma(s,
+                            fma(s, 0.142316903342317766e-3,
+                                   0.304455919504853031e-1),
+                            0.220638780716667420e0),
+                        0.447677206805497472e0),
+                    0.268297920532545909e0);
+
+    double qd = fma(s,
+                    fma(s,
+                        fma(s,
+                            fma(s, 0.389525873944742195e-1,
+                                   0.424602594203847109e0),
+                            0.141254259931958921e1),
+                        0.182596787737507063e1),
+                    0.804893761597637733e0);
+
+    double q = r * s * qn / qd;
+    r = chi - ((q - clo) - r);
+
+    double z = isnan(x) ? x : piby2;
+    z = v <= 0x1.0p+56 ? r : z;
+    z = v < 0x1.0p-26 ? v : z;
+    return x == v ? z : -z;
+}
+
diff --git a/amd-builtins/math64/atanhD.cl b/amd-builtins/math64/atanhD.cl
new file mode 100644
index 0000000..a362100
--- /dev/null
+++ b/amd-builtins/math64/atanhD.cl
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+atanh(double x)
+{
+    double absx = fabs(x);
+
+    double ret = absx == 1.0 ? as_double(PINFBITPATT_DP64) : as_double(QNANBITPATT_DP64);
+
+    // |x| >= 0.5
+    // Note that atanh(x) = 0.5 * ln((1+x)/(1-x))
+    // For greater accuracy we use
+    // ln((1+x)/(1-x)) = ln(1 + 2x/(1-x)) = log1p(2x/(1-x)).
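+    // (One algebra step: (1+x)/(1-x) = 1 + 2x/(1-x), hence the single
+    // log1p call below.)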
+    double r = 0.5 * log1p(2.0 * absx / (1.0 - absx));
+    ret = absx < 1.0 ? r : ret;
+
+    r = -ret;
+    ret = x < 0.0 ? r : ret;
+
+    // Arguments up to 0.5 in magnitude are
+    // approximated by a [5,5] minimax polynomial
+    double t = x * x;
+
+    double pn = fma(t,
+                    fma(t,
+                        fma(t,
+                            fma(t,
+                                fma(t, -0.10468158892753136958e-3, 0.28728638600548514553e-1),
+                                -0.28180210961780814148e0),
+                            0.88468142536501647470e0),
+                        -0.11028356797846341457e1),
+                    0.47482573589747356373e0);
+
+    double pd = fma(t,
+                    fma(t,
+                        fma(t,
+                            fma(t,
+                                fma(t, -0.35861554370169537512e-1, 0.49561196555503101989e0),
+                                -0.22608883748988489342e1),
+                            0.45414700626084508355e1),
+                        -0.41631933639693546274e1),
+                    0.14244772076924206909e1);
+
+    r = fma(x*t, pn/pd, x);
+    ret = absx < 0.5 ? r : ret;
+
+    return ret;
+}
+
diff --git a/amd-builtins/math64/atanpiD.cl b/amd-builtins/math64/atanpiD.cl
new file mode 100644
index 0000000..0ce71e6
--- /dev/null
+++ b/amd-builtins/math64/atanpiD.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+atanpi(double x)
+{
+    const double pi = 0x1.921fb54442d18p+1;
+
+    double v = fabs(x);
+
+    // 2^56 > v > 39/16
+    double a = -1.0;
+    double b = v;
+    // (chi + clo) = arctan(infinity)
+    double chi = 1.57079632679489655800e+00;
+    double clo = 6.12323399573676480327e-17;
+
+    double ta = v - 1.5;
+    double tb = 1.0 + 1.5 * v;
+    int l = v <= 0x1.38p+1; // 39/16 > v > 19/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.5)
+    chi = l ? 9.82793723247329054082e-01 : chi;
+    clo = l ? 1.39033110312309953701e-17 : clo;
+
+    ta = v - 1.0;
+    tb = 1.0 + v;
+    l = v <= 0x1.3p+0; // 19/16 > v > 11/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.)
+    chi = l ? 7.85398163397448278999e-01 : chi;
+    clo = l ? 3.06161699786838240164e-17 : clo;
+
+    ta = 2.0 * v - 1.0;
+    tb = 2.0 + v;
+    l = v <= 0x1.6p-1; // 11/16 > v > 7/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(0.5)
+    chi = l ? 4.63647609000806093515e-01 : chi;
+    clo = l ? 2.26987774529616809294e-17 : clo;
+
+    l = v <= 0x1.cp-2; // v <= 7/16
+    a = l ? v : a;
+    b = l ? 1.0 : b;
+    chi = l ? 0.0 : chi;
+    clo = l ? 0.0 : clo;
+
+    // Core approximation: Remez(4,4) on [-7/16,7/16]
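+    // As in atan(): the branches above pick a pivot k with
+    // (chi + clo) = atan(k) and a/b = (v - k)/(1 + k*v), so that
+    // atan(v) = atan(k) + atan(a/b); the result is then scaled by 1/pi.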
+    double r = a / b;
+    double s = r * r;
+    double qn = fma(s,
+                    fma(s,
+                        fma(s,
+                            fma(s, 0.142316903342317766e-3,
+                                   0.304455919504853031e-1),
+                            0.220638780716667420e0),
+                        0.447677206805497472e0),
+                    0.268297920532545909e0);
+
+    double qd = fma(s,
+                    fma(s,
+                        fma(s,
+                            fma(s, 0.389525873944742195e-1,
+                                   0.424602594203847109e0),
+                            0.141254259931958921e1),
+                        0.182596787737507063e1),
+                    0.804893761597637733e0);
+
+    double q = r * s * qn / qd;
+    r = (chi - ((q - clo) - r)) / pi;
+    double vp = v / pi;
+
+    double z = isnan(x) ? x : 0.5;
+    z = v <= 0x1.0p+56 ? r : z;
+    z = v < 0x1.0p-26 ? vp : z;
+    return x == v ? z : -z;
+}
+
diff --git a/amd-builtins/math64/cbrtD.cl b/amd-builtins/math64/cbrtD.cl
new file mode 100644
index 0000000..9f0c688
--- /dev/null
+++ b/amd-builtins/math64/cbrtD.cl
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+// Algorithm:
+//
+// x = (2^m)*A
+// x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+// x = (2^m)*2*(G/2+g/2)
+// x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+//
+// Y = (2^(-1))*(2^(-m))*(2^m)*A
+// Now, range of Y is: 0.5 <= Y < 1
+//
+// F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit, for rounding)
+// Now, range of F is: 256 <= F <= 512
+// F = F / 512
+// Now, range of F is: 0.5 <= F <= 1
+//
+// f = (Y-F), with (f <= 2^(-9))
+//
+// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F+f)
+// cbrt(x) = cbrt(2^m) * cbrt(2) * cbrt(F) * cbrt(1+(f/F))
+// cbrt(x) = cbrt(2^m) * cbrt(2*F) * cbrt(1+r)
+//
+// r = (f/F), with (r <= 2^(-8))
+// r = f*(1/F) with (1/F) precomputed to avoid division
+//
+// cbrt(x) = cbrt(2^m) * cbrt(G) * (1+poly)
+//
+// poly = c1*r + c2*(r^2) + c3*(r^3) + c4*(r^4) + c5*(r^5) + c6*(r^6)
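+//
+// Worked example (illustrative, not from the original source):
+// x = 10 = 2^3 * 1.25 gives m = 3, Y = 0.625; the nearest grid point is
+// F = 320/512 = 0.625, so f = 0, r = 0 and poly = 0, leaving
+// cbrt(10) = cbrt(2^3) * cbrt(2*0.625) = 2 * cbrt(1.25) ~= 2.15443.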
+
+
+__attribute__((overloadable)) double
+cbrt(double x)
+{
+    USE_TABLE(double, p_inv, CBRT_TBL_INV);
+    USE_TABLE(double2, p_cbrt, CBRT_TBL);
+    USE_TABLE(double2, p_rem, CBRT_TBL_REM);
+
+    int return_x = isinf(x) | isnan(x) | x == 0.0;
+    ulong ux = as_ulong(fabs(x));
+    int m = (as_int2(ux).hi >> 20) - 1023;
+
+    // Treat subnormals
+    ulong uxs = as_ulong(as_double(0x3ff0000000000000UL | ux) - 1.0);
+    int ms = m + (as_int2(uxs).hi >> 20) - 1022;
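+    // (OR-ing in a biased exponent of 1023 turns the subnormal mantissa
+    // into 1 + mant*2^-52; subtracting 1.0 leaves a normal number whose
+    // exponent recovers the shift needed for the true exponent ms.)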
+
+    int c = m == -1023;
+    ux = c ? uxs : ux;
+    m = c ? ms : m;
+
+    int mby3 = m / 3;
+    int rem = m - 3*mby3;
+
+    double mf = as_double((ulong)(mby3 + 1023) << 52);
+
+    ux &= 0x000fffffffffffffUL;
+    double Y = as_double(0x3fe0000000000000UL | ux);
+
+    // Round Y's leading mantissa bits to the nearest table index; F = index * 2^-9
+    int index = as_int2(ux).hi >> 11;
+    index = (0x100 | (index >> 1)) + (index & 1);
+    double F = (double)index * 0x1.0p-9;
+    
+    double f = Y - F;
+    double r = f * p_inv[index-256];
+
+    double z = r * fma(r,
+                       fma(r,
+                           fma(r,
+                               fma(r,
+                                   fma(r, -0x1.8090d6221a247p-6, 0x1.ee7113506ac13p-6),
+                                   -0x1.511e8d2b3183bp-5),
+                               0x1.f9add3c0ca458p-5),
+                           -0x1.c71c71c71c71cp-4),
+                       0x1.5555555555555p-2);
+
+    double2 tv = p_rem[rem+2];
+    double Rem_h = tv.s0;
+    double Rem_t = tv.s1;
+
+    tv = p_cbrt[index-256];
+    double F_h = tv.s0;
+    double F_t = tv.s1;
+
+    double b_h = F_h * Rem_h; 
+    double b_t = fma(Rem_t, F_h, fma(F_t, Rem_h, F_t*Rem_t));
+
+    double ans = fma(z, b_h, fma(z, b_t, b_t)) + b_h;
+    ans = copysign(ans*mf, x);
+    return return_x ? x : ans;
+}
+
diff --git a/amd-builtins/math64/cbrtD_table.h b/amd-builtins/math64/cbrtD_table.h
new file mode 100644
index 0000000..5dbfe55
--- /dev/null
+++ b/amd-builtins/math64/cbrtD_table.h
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(double, CBRT_TBL_INV, 257,
+    0x1.0000000000000p+1,
+    0x1.fe01fe01fe020p+0,
+    0x1.fc07f01fc07f0p+0,
+    0x1.fa11caa01fa12p+0,
+    0x1.f81f81f81f820p+0,
+    0x1.f6310aca0dbb5p+0,
+    0x1.f44659e4a4271p+0,
+    0x1.f25f644230ab5p+0,
+    0x1.f07c1f07c1f08p+0,
+    0x1.ee9c7f8458e02p+0,
+    0x1.ecc07b301ecc0p+0,
+    0x1.eae807aba01ebp+0,
+    0x1.e9131abf0b767p+0,
+    0x1.e741aa59750e4p+0,
+    0x1.e573ac901e574p+0,
+    0x1.e3a9179dc1a73p+0,
+    0x1.e1e1e1e1e1e1ep+0,
+    0x1.e01e01e01e01ep+0,
+    0x1.de5d6e3f8868ap+0,
+    0x1.dca01dca01dcap+0,
+    0x1.dae6076b981dbp+0,
+    0x1.d92f2231e7f8ap+0,
+    0x1.d77b654b82c34p+0,
+    0x1.d5cac807572b2p+0,
+    0x1.d41d41d41d41dp+0,
+    0x1.d272ca3fc5b1ap+0,
+    0x1.d0cb58f6ec074p+0,
+    0x1.cf26e5c44bfc6p+0,
+    0x1.cd85689039b0bp+0,
+    0x1.cbe6d9601cbe7p+0,
+    0x1.ca4b3055ee191p+0,
+    0x1.c8b265afb8a42p+0,
+    0x1.c71c71c71c71cp+0,
+    0x1.c5894d10d4986p+0,
+    0x1.c3f8f01c3f8f0p+0,
+    0x1.c26b5392ea01cp+0,
+    0x1.c0e070381c0e0p+0,
+    0x1.bf583ee868d8bp+0,
+    0x1.bdd2b899406f7p+0,
+    0x1.bc4fd65883e7bp+0,
+    0x1.bacf914c1bad0p+0,
+    0x1.b951e2b18ff23p+0,
+    0x1.b7d6c3dda338bp+0,
+    0x1.b65e2e3beee05p+0,
+    0x1.b4e81b4e81b4fp+0,
+    0x1.b37484ad806cep+0,
+    0x1.b2036406c80d9p+0,
+    0x1.b094b31d922a4p+0,
+    0x1.af286bca1af28p+0,
+    0x1.adbe87f94905ep+0,
+    0x1.ac5701ac5701bp+0,
+    0x1.aaf1d2f87ebfdp+0,
+    0x1.a98ef606a63bep+0,
+    0x1.a82e65130e159p+0,
+    0x1.a6d01a6d01a6dp+0,
+    0x1.a574107688a4ap+0,
+    0x1.a41a41a41a41ap+0,
+    0x1.a2c2a87c51ca0p+0,
+    0x1.a16d3f97a4b02p+0,
+    0x1.a01a01a01a01ap+0,
+    0x1.9ec8e951033d9p+0,
+    0x1.9d79f176b682dp+0,
+    0x1.9c2d14ee4a102p+0,
+    0x1.9ae24ea5510dap+0,
+    0x1.999999999999ap+0,
+    0x1.9852f0d8ec0ffp+0,
+    0x1.970e4f80cb872p+0,
+    0x1.95cbb0be377aep+0,
+    0x1.948b0fcd6e9e0p+0,
+    0x1.934c67f9b2ce6p+0,
+    0x1.920fb49d0e229p+0,
+    0x1.90d4f120190d5p+0,
+    0x1.8f9c18f9c18fap+0,
+    0x1.8e6527af1373fp+0,
+    0x1.8d3018d3018d3p+0,
+    0x1.8bfce8062ff3ap+0,
+    0x1.8acb90f6bf3aap+0,
+    0x1.899c0f601899cp+0,
+    0x1.886e5f0abb04ap+0,
+    0x1.87427bcc092b9p+0,
+    0x1.8618618618618p+0,
+    0x1.84f00c2780614p+0,
+    0x1.83c977ab2beddp+0,
+    0x1.82a4a0182a4a0p+0,
+    0x1.8181818181818p+0,
+    0x1.8060180601806p+0,
+    0x1.7f405fd017f40p+0,
+    0x1.7e225515a4f1dp+0,
+    0x1.7d05f417d05f4p+0,
+    0x1.7beb3922e017cp+0,
+    0x1.7ad2208e0ecc3p+0,
+    0x1.79baa6bb6398bp+0,
+    0x1.78a4c8178a4c8p+0,
+    0x1.77908119ac60dp+0,
+    0x1.767dce434a9b1p+0,
+    0x1.756cac201756dp+0,
+    0x1.745d1745d1746p+0,
+    0x1.734f0c541fe8dp+0,
+    0x1.724287f46debcp+0,
+    0x1.713786d9c7c09p+0,
+    0x1.702e05c0b8170p+0,
+    0x1.6f26016f26017p+0,
+    0x1.6e1f76b4337c7p+0,
+    0x1.6d1a62681c861p+0,
+    0x1.6c16c16c16c17p+0,
+    0x1.6b1490aa31a3dp+0,
+    0x1.6a13cd1537290p+0,
+    0x1.691473a88d0c0p+0,
+    0x1.6816816816817p+0,
+    0x1.6719f3601671ap+0,
+    0x1.661ec6a5122f9p+0,
+    0x1.6524f853b4aa3p+0,
+    0x1.642c8590b2164p+0,
+    0x1.63356b88ac0dep+0,
+    0x1.623fa77016240p+0,
+    0x1.614b36831ae94p+0,
+    0x1.6058160581606p+0,
+    0x1.5f66434292dfcp+0,
+    0x1.5e75bb8d015e7p+0,
+    0x1.5d867c3ece2a5p+0,
+    0x1.5c9882b931057p+0,
+    0x1.5babcc647fa91p+0,
+    0x1.5ac056b015ac0p+0,
+    0x1.59d61f123ccaap+0,
+    0x1.58ed2308158edp+0,
+    0x1.5805601580560p+0,
+    0x1.571ed3c506b3ap+0,
+    0x1.56397ba7c52e2p+0,
+    0x1.5555555555555p+0,
+    0x1.54725e6bb82fep+0,
+    0x1.5390948f40febp+0,
+    0x1.52aff56a8054bp+0,
+    0x1.51d07eae2f815p+0,
+    0x1.50f22e111c4c5p+0,
+    0x1.5015015015015p+0,
+    0x1.4f38f62dd4c9bp+0,
+    0x1.4e5e0a72f0539p+0,
+    0x1.4d843bedc2c4cp+0,
+    0x1.4cab88725af6ep+0,
+    0x1.4bd3edda68fe1p+0,
+    0x1.4afd6a052bf5bp+0,
+    0x1.4a27fad76014ap+0,
+    0x1.49539e3b2d067p+0,
+    0x1.4880522014880p+0,
+    0x1.47ae147ae147bp+0,
+    0x1.46dce34596066p+0,
+    0x1.460cbc7f5cf9ap+0,
+    0x1.453d9e2c776cap+0,
+    0x1.446f86562d9fbp+0,
+    0x1.43a2730abee4dp+0,
+    0x1.42d6625d51f87p+0,
+    0x1.420b5265e5951p+0,
+    0x1.4141414141414p+0,
+    0x1.40782d10e6566p+0,
+    0x1.3fb013fb013fbp+0,
+    0x1.3ee8f42a5af07p+0,
+    0x1.3e22cbce4a902p+0,
+    0x1.3d5d991aa75c6p+0,
+    0x1.3c995a47babe7p+0,
+    0x1.3bd60d9232955p+0,
+    0x1.3b13b13b13b14p+0,
+    0x1.3a524387ac822p+0,
+    0x1.3991c2c187f63p+0,
+    0x1.38d22d366088ep+0,
+    0x1.3813813813814p+0,
+    0x1.3755bd1c945eep+0,
+    0x1.3698df3de0748p+0,
+    0x1.35dce5f9f2af8p+0,
+    0x1.3521cfb2b78c1p+0,
+    0x1.34679ace01346p+0,
+    0x1.33ae45b57bcb2p+0,
+    0x1.32f5ced6a1dfap+0,
+    0x1.323e34a2b10bfp+0,
+    0x1.3187758e9ebb6p+0,
+    0x1.30d190130d190p+0,
+    0x1.301c82ac40260p+0,
+    0x1.2f684bda12f68p+0,
+    0x1.2eb4ea1fed14bp+0,
+    0x1.2e025c04b8097p+0,
+    0x1.2d50a012d50a0p+0,
+    0x1.2c9fb4d812ca0p+0,
+    0x1.2bef98e5a3711p+0,
+    0x1.2b404ad012b40p+0,
+    0x1.2a91c92f3c105p+0,
+    0x1.29e4129e4129ep+0,
+    0x1.293725bb804a5p+0,
+    0x1.288b01288b013p+0,
+    0x1.27dfa38a1ce4dp+0,
+    0x1.27350b8812735p+0,
+    0x1.268b37cd60127p+0,
+    0x1.25e22708092f1p+0,
+    0x1.2539d7e9177b2p+0,
+    0x1.2492492492492p+0,
+    0x1.23eb79717605bp+0,
+    0x1.23456789abcdfp+0,
+    0x1.22a0122a0122ap+0,
+    0x1.21fb78121fb78p+0,
+    0x1.21579804855e6p+0,
+    0x1.20b470c67c0d9p+0,
+    0x1.2012012012012p+0,
+    0x1.1f7047dc11f70p+0,
+    0x1.1ecf43c7fb84cp+0,
+    0x1.1e2ef3b3fb874p+0,
+    0x1.1d8f5672e4abdp+0,
+    0x1.1cf06ada2811dp+0,
+    0x1.1c522fc1ce059p+0,
+    0x1.1bb4a4046ed29p+0,
+    0x1.1b17c67f2bae3p+0,
+    0x1.1a7b9611a7b96p+0,
+    0x1.19e0119e0119ep+0,
+    0x1.19453808ca29cp+0,
+    0x1.18ab083902bdbp+0,
+    0x1.1811811811812p+0,
+    0x1.1778a191bd684p+0,
+    0x1.16e0689427379p+0,
+    0x1.1648d50fc3201p+0,
+    0x1.15b1e5f75270dp+0,
+    0x1.151b9a3fdd5c9p+0,
+    0x1.1485f0e0acd3bp+0,
+    0x1.13f0e8d344724p+0,
+    0x1.135c81135c811p+0,
+    0x1.12c8b89edc0acp+0,
+    0x1.12358e75d3033p+0,
+    0x1.11a3019a74826p+0,
+    0x1.1111111111111p+0,
+    0x1.107fbbe011080p+0,
+    0x1.0fef010fef011p+0,
+    0x1.0f5edfab325a2p+0,
+    0x1.0ecf56be69c90p+0,
+    0x1.0e40655826011p+0,
+    0x1.0db20a88f4696p+0,
+    0x1.0d24456359e3ap+0,
+    0x1.0c9714fbcda3bp+0,
+    0x1.0c0a7868b4171p+0,
+    0x1.0b7e6ec259dc8p+0,
+    0x1.0af2f722eecb5p+0,
+    0x1.0a6810a6810a7p+0,
+    0x1.09ddba6af8360p+0,
+    0x1.0953f39010954p+0,
+    0x1.08cabb37565e2p+0,
+    0x1.0842108421084p+0,
+    0x1.07b9f29b8eae2p+0,
+    0x1.073260a47f7c6p+0,
+    0x1.06ab59c7912fbp+0,
+    0x1.0624dd2f1a9fcp+0,
+    0x1.059eea0727586p+0,
+    0x1.05197f7d73404p+0,
+    0x1.04949cc1664c5p+0,
+    0x1.0410410410410p+0,
+    0x1.038c6b78247fcp+0,
+    0x1.03091b51f5e1ap+0,
+    0x1.02864fc7729e9p+0,
+    0x1.0204081020408p+0,
+    0x1.0182436517a37p+0,
+    0x1.0101010101010p+0,
+    0x1.0080402010080p+0,
+    0x1.0000000000000p+0,
+)
+
+DECLARE_TABLE(double2, CBRT_TBL, 257,
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.0055380000000p+0, 0x1.e6a24c81e4294p-25),
+    (double2)(0x1.00aa390000000p+0, 0x1.8548511e3a785p-26),
+    (double2)(0x1.00ff010000000p+0, 0x1.4eb9336ec07f6p-25),
+    (double2)(0x1.0153920000000p+0, 0x1.0ea64b8b750e1p-27),
+    (double2)(0x1.01a7eb0000000p+0, 0x1.61637cff8a53cp-27),
+    (double2)(0x1.01fc0d0000000p+0, 0x1.0733bf7bd1943p-27),
+    (double2)(0x1.024ff80000000p+0, 0x1.666911345ccedp-26),
+    (double2)(0x1.02a3ad0000000p+0, 0x1.77b7a3f592f14p-27),
+    (double2)(0x1.02f72b0000000p+0, 0x1.f18d3dd1a5402p-25),
+    (double2)(0x1.034a750000000p+0, 0x1.be2f5a58ee9a4p-29),
+    (double2)(0x1.039d880000000p+0, 0x1.8901f8f085fa7p-25),
+    (double2)(0x1.03f0670000000p+0, 0x1.c68b8cd5b5d69p-26),
+    (double2)(0x1.0443110000000p+0, 0x1.a6b0e8624be42p-26),
+    (double2)(0x1.0495870000000p+0, 0x1.c4b22b06f68e7p-36),
+    (double2)(0x1.04e7c80000000p+0, 0x1.0f3f0afcabe9bp-25),
+    (double2)(0x1.0539d60000000p+0, 0x1.48495bca4e1b7p-26),
+    (double2)(0x1.058bb00000000p+0, 0x1.6107f1abdfdc3p-25),
+    (double2)(0x1.05dd570000000p+0, 0x1.e67261878288ap-25),
+    (double2)(0x1.062ecc0000000p+0, 0x1.a6bc155286f1ep-26),
+    (double2)(0x1.06800e0000000p+0, 0x1.8a759c64a85f2p-26),
+    (double2)(0x1.06d11e0000000p+0, 0x1.5fce70a4a8d09p-27),
+    (double2)(0x1.0721fc0000000p+0, 0x1.2f9cbf373fe1dp-28),
+    (double2)(0x1.0772a80000000p+0, 0x1.90564ce4ac359p-26),
+    (double2)(0x1.07c3230000000p+0, 0x1.ac29ce761b02fp-26),
+    (double2)(0x1.08136d0000000p+0, 0x1.cb752f497381cp-26),
+    (double2)(0x1.0863860000000p+0, 0x1.8bb9e1cfb35e0p-25),
+    (double2)(0x1.08b36f0000000p+0, 0x1.5b4917099de90p-25),
+    (double2)(0x1.0903280000000p+0, 0x1.cc77ac9c65ef2p-26),
+    (double2)(0x1.0952b10000000p+0, 0x1.7a0f3e7be3dbap-26),
+    (double2)(0x1.09a20a0000000p+0, 0x1.6ec851ee0c16fp-25),
+    (double2)(0x1.09f1340000000p+0, 0x1.89449bf2946dap-25),
+    (double2)(0x1.0a402f0000000p+0, 0x1.98f25301ba223p-25),
+    (double2)(0x1.0a8efc0000000p+0, 0x1.47d5ec651f549p-28),
+    (double2)(0x1.0add990000000p+0, 0x1.c33ec9a86007ap-25),
+    (double2)(0x1.0b2c090000000p+0, 0x1.e0b6653e92649p-26),
+    (double2)(0x1.0b7a4b0000000p+0, 0x1.bd64ac09d755fp-28),
+    (double2)(0x1.0bc85f0000000p+0, 0x1.f537506f78167p-29),
+    (double2)(0x1.0c16450000000p+0, 0x1.2c382d1b3735ep-25),
+    (double2)(0x1.0c63fe0000000p+0, 0x1.e20ed659f99e1p-25),
+    (double2)(0x1.0cb18b0000000p+0, 0x1.86b633a9c182ap-26),
+    (double2)(0x1.0cfeeb0000000p+0, 0x1.45cfd5a65e777p-27),
+    (double2)(0x1.0d4c1e0000000p+0, 0x1.0c8770f58bca4p-25),
+    (double2)(0x1.0d99250000000p+0, 0x1.739e44b0933c5p-25),
+    (double2)(0x1.0de6010000000p+0, 0x1.27dc3d9ce7bd8p-31),
+    (double2)(0x1.0e32b00000000p+0, 0x1.3c53c7c5a7b64p-25),
+    (double2)(0x1.0e7f340000000p+0, 0x1.9669683830cecp-25),
+    (double2)(0x1.0ecb8d0000000p+0, 0x1.8d772c39bdcc4p-25),
+    (double2)(0x1.0f17bb0000000p+0, 0x1.9b0008bcf6d7bp-25),
+    (double2)(0x1.0f63bf0000000p+0, 0x1.bbb305825ce4fp-28),
+    (double2)(0x1.0faf970000000p+0, 0x1.da3f4af13a406p-25),
+    (double2)(0x1.0ffb460000000p+0, 0x1.f36b96f74ce86p-26),
+    (double2)(0x1.1046cb0000000p+0, 0x1.65c002303f790p-30),
+    (double2)(0x1.1092250000000p+0, 0x1.82f84095ba7d5p-25),
+    (double2)(0x1.10dd560000000p+0, 0x1.d46433541b2c6p-25),
+    (double2)(0x1.11285e0000000p+0, 0x1.71c3d56e93a89p-25),
+    (double2)(0x1.11733d0000000p+0, 0x1.98dcef4e40012p-26),
+    (double2)(0x1.11bdf30000000p+0, 0x1.530ebef17fe03p-27),
+    (double2)(0x1.1208800000000p+0, 0x1.e8b8fa3715066p-27),
+    (double2)(0x1.1252e40000000p+0, 0x1.ab26eb3b211dcp-25),
+    (double2)(0x1.129d210000000p+0, 0x1.54dd4dc906307p-27),
+    (double2)(0x1.12e7350000000p+0, 0x1.c9f962387984ep-26),
+    (double2)(0x1.1331210000000p+0, 0x1.c62a959afec09p-25),
+    (double2)(0x1.137ae60000000p+0, 0x1.638d9ac6a866ap-25),
+    (double2)(0x1.13c4840000000p+0, 0x1.38704eca8a22dp-28),
+    (double2)(0x1.140dfa0000000p+0, 0x1.e6c9e1db14f8fp-27),
+    (double2)(0x1.1457490000000p+0, 0x1.8744b7f9c9eaap-26),
+    (double2)(0x1.14a0710000000p+0, 0x1.6c2893486373bp-25),
+    (double2)(0x1.14e9730000000p+0, 0x1.b36bce31699b7p-26),
+    (double2)(0x1.15324e0000000p+0, 0x1.71e3813d200c7p-25),
+    (double2)(0x1.157b030000000p+0, 0x1.99755ab40aa88p-25),
+    (double2)(0x1.15c3920000000p+0, 0x1.b45ca0e4bcfc0p-25),
+    (double2)(0x1.160bfc0000000p+0, 0x1.2dd090d869c5dp-28),
+    (double2)(0x1.16543f0000000p+0, 0x1.4fe0516b917dap-25),
+    (double2)(0x1.169c5d0000000p+0, 0x1.94563226317a2p-25),
+    (double2)(0x1.16e4560000000p+0, 0x1.53d8fafc2c851p-25),
+    (double2)(0x1.172c2a0000000p+0, 0x1.dcbd41fbd41a3p-26),
+    (double2)(0x1.1773d90000000p+0, 0x1.862ff5285f59cp-26),
+    (double2)(0x1.17bb630000000p+0, 0x1.3072ea97a1e1cp-25),
+    (double2)(0x1.1802c90000000p+0, 0x1.2839075184805p-26),
+    (double2)(0x1.184a0a0000000p+0, 0x1.4b0323e9eff42p-25),
+    (double2)(0x1.1891270000000p+0, 0x1.b158893c45484p-25),
+    (double2)(0x1.18d8210000000p+0, 0x1.149ef0fc35826p-28),
+    (double2)(0x1.191ef60000000p+0, 0x1.f2e77ea96acaap-26),
+    (double2)(0x1.1965a80000000p+0, 0x1.200074c471a95p-26),
+    (double2)(0x1.19ac360000000p+0, 0x1.3f8cc517f6f04p-25),
+    (double2)(0x1.19f2a10000000p+0, 0x1.60ba2e311bb55p-25),
+    (double2)(0x1.1a38e90000000p+0, 0x1.4b788730bbec3p-25),
+    (double2)(0x1.1a7f0e0000000p+0, 0x1.57090795ee20cp-25),
+    (double2)(0x1.1ac5100000000p+0, 0x1.d9ffe983670b1p-25),
+    (double2)(0x1.1b0af00000000p+0, 0x1.2a463ff61bfdap-25),
+    (double2)(0x1.1b50ad0000000p+0, 0x1.9d1bc6a5e65cfp-25),
+    (double2)(0x1.1b96480000000p+0, 0x1.8718abaa9e922p-25),
+    (double2)(0x1.1bdbc10000000p+0, 0x1.3c2f52ffa342ep-25),
+    (double2)(0x1.1c21180000000p+0, 0x1.0fae13ff42c80p-25),
+    (double2)(0x1.1c664d0000000p+0, 0x1.5440f0ef00d57p-25),
+    (double2)(0x1.1cab610000000p+0, 0x1.6fcd22d4e3c1ep-27),
+    (double2)(0x1.1cf0530000000p+0, 0x1.e0c60b409e863p-27),
+    (double2)(0x1.1d35230000000p+0, 0x1.f9cab5a5f0333p-25),
+    (double2)(0x1.1d79d30000000p+0, 0x1.30f24744c333dp-25),
+    (double2)(0x1.1dbe620000000p+0, 0x1.b50622a76b2fep-27),
+    (double2)(0x1.1e02cf0000000p+0, 0x1.fdb94ba595375p-25),
+    (double2)(0x1.1e471d0000000p+0, 0x1.861b9b945a171p-28),
+    (double2)(0x1.1e8b490000000p+0, 0x1.54348015188c4p-25),
+    (double2)(0x1.1ecf550000000p+0, 0x1.b54d149865523p-25),
+    (double2)(0x1.1f13410000000p+0, 0x1.a0bb783d9de33p-25),
+    (double2)(0x1.1f570d0000000p+0, 0x1.629d12b1a2157p-25),
+    (double2)(0x1.1f9ab90000000p+0, 0x1.467fe35d179dfp-25),
+    (double2)(0x1.1fde450000000p+0, 0x1.9763f3e26c8f7p-25),
+    (double2)(0x1.2021b20000000p+0, 0x1.3f798bb9f7679p-26),
+    (double2)(0x1.2064ff0000000p+0, 0x1.52e577e855898p-26),
+    (double2)(0x1.20a82c0000000p+0, 0x1.fde47e5502c3ap-25),
+    (double2)(0x1.20eb3b0000000p+0, 0x1.cbd0b548d96a0p-26),
+    (double2)(0x1.212e2a0000000p+0, 0x1.a9cd9f7be8de8p-25),
+    (double2)(0x1.2170fb0000000p+0, 0x1.22bbe704886dep-26),
+    (double2)(0x1.21b3ac0000000p+0, 0x1.e3dea8317f020p-25),
+    (double2)(0x1.21f63f0000000p+0, 0x1.e812085ac8855p-25),
+    (double2)(0x1.2238b40000000p+0, 0x1.c87144f24cb07p-26),
+    (double2)(0x1.227b0a0000000p+0, 0x1.1e128ee311fa2p-25),
+    (double2)(0x1.22bd420000000p+0, 0x1.b5c163d61a2d3p-26),
+    (double2)(0x1.22ff5c0000000p+0, 0x1.7d97e7fb90633p-27),
+    (double2)(0x1.2341570000000p+0, 0x1.efe899d50f6a7p-25),
+    (double2)(0x1.2383350000000p+0, 0x1.d0333eb75de5ap-25),
+    (double2)(0x1.23c4f60000000p+0, 0x1.0e590be73a573p-27),
+    (double2)(0x1.2406980000000p+0, 0x1.8ce8dcac3cdd2p-25),
+    (double2)(0x1.24481d0000000p+0, 0x1.ee8a48954064bp-25),
+    (double2)(0x1.2489850000000p+0, 0x1.aa62f18461e09p-25),
+    (double2)(0x1.24cad00000000p+0, 0x1.01e5940986a15p-25),
+    (double2)(0x1.250bfe0000000p+0, 0x1.b082f4f9b8d4cp-28),
+    (double2)(0x1.254d0e0000000p+0, 0x1.876e0e5527f5ap-25),
+    (double2)(0x1.258e020000000p+0, 0x1.3617080831e6bp-25),
+    (double2)(0x1.25ced90000000p+0, 0x1.81b26e34aa4a2p-25),
+    (double2)(0x1.260f940000000p+0, 0x1.52ee66dfab0c1p-26),
+    (double2)(0x1.2650320000000p+0, 0x1.d85a5329e8819p-26),
+    (double2)(0x1.2690b40000000p+0, 0x1.105c1b646b5d1p-26),
+    (double2)(0x1.26d1190000000p+0, 0x1.bb6690c1a379cp-25),
+    (double2)(0x1.2711630000000p+0, 0x1.86aeba73ce3a9p-26),
+    (double2)(0x1.2751900000000p+0, 0x1.dd16198294dd4p-25),
+    (double2)(0x1.2791a20000000p+0, 0x1.454e675775e83p-25),
+    (double2)(0x1.27d1980000000p+0, 0x1.3842e026197eap-25),
+    (double2)(0x1.2811720000000p+0, 0x1.f1ce0e70c44d2p-25),
+    (double2)(0x1.2851310000000p+0, 0x1.ad636441a5627p-25),
+    (double2)(0x1.2890d50000000p+0, 0x1.4c205d7212abbp-26),
+    (double2)(0x1.28d05d0000000p+0, 0x1.167c86c116419p-25),
+    (double2)(0x1.290fca0000000p+0, 0x1.38ec3ef16e294p-25),
+    (double2)(0x1.294f1c0000000p+0, 0x1.473fceace9321p-25),
+    (double2)(0x1.298e530000000p+0, 0x1.7af53a836dba7p-25),
+    (double2)(0x1.29cd700000000p+0, 0x1.a51f3c383b652p-30),
+    (double2)(0x1.2a0c710000000p+0, 0x1.3696da190822dp-25),
+    (double2)(0x1.2a4b580000000p+0, 0x1.2f9adec77074bp-25),
+    (double2)(0x1.2a8a250000000p+0, 0x1.8190fd5bee55fp-28),
+    (double2)(0x1.2ac8d70000000p+0, 0x1.bfee8fac68e55p-27),
+    (double2)(0x1.2b076f0000000p+0, 0x1.31c9d6bc5f68ap-28),
+    (double2)(0x1.2b45ec0000000p+0, 0x1.89d0523737edfp-25),
+    (double2)(0x1.2b84500000000p+0, 0x1.a295943bf47bbp-26),
+    (double2)(0x1.2bc29a0000000p+0, 0x1.96be32e5b3207p-28),
+    (double2)(0x1.2c00c90000000p+0, 0x1.e44c7d909fa0ep-25),
+    (double2)(0x1.2c3ee00000000p+0, 0x1.b2505da94d9eap-29),
+    (double2)(0x1.2c7cdc0000000p+0, 0x1.0c851f46c9c98p-25),
+    (double2)(0x1.2cbabf0000000p+0, 0x1.da71f7d9aa3b7p-26),
+    (double2)(0x1.2cf8880000000p+0, 0x1.f1b605d019ef1p-25),
+    (double2)(0x1.2d36390000000p+0, 0x1.386e8a2189563p-27),
+    (double2)(0x1.2d73d00000000p+0, 0x1.b19fa5d306ba7p-28),
+    (double2)(0x1.2db14d0000000p+0, 0x1.dd749b67aef76p-25),
+    (double2)(0x1.2deeb20000000p+0, 0x1.76ff6f1dc04b0p-25),
+    (double2)(0x1.2e2bfe0000000p+0, 0x1.35a33d0b232a6p-25),
+    (double2)(0x1.2e69310000000p+0, 0x1.4bdc80024a4e1p-25),
+    (double2)(0x1.2ea64b0000000p+0, 0x1.ebd61770fd723p-25),
+    (double2)(0x1.2ee34d0000000p+0, 0x1.4769fc537264dp-25),
+    (double2)(0x1.2f20360000000p+0, 0x1.9021f429f3b98p-25),
+    (double2)(0x1.2f5d070000000p+0, 0x1.ee7083efbd606p-26),
+    (double2)(0x1.2f99bf0000000p+0, 0x1.ad985552a6b1ap-25),
+    (double2)(0x1.2fd65f0000000p+0, 0x1.e3df778772160p-25),
+    (double2)(0x1.3012e70000000p+0, 0x1.ca5d76ddc9b34p-25),
+    (double2)(0x1.304f570000000p+0, 0x1.91154ffdbaf74p-25),
+    (double2)(0x1.308baf0000000p+0, 0x1.67bdd57fb306ap-25),
+    (double2)(0x1.30c7ef0000000p+0, 0x1.7dc255ac40886p-25),
+    (double2)(0x1.3104180000000p+0, 0x1.219f38e8afafep-32),
+    (double2)(0x1.3140280000000p+0, 0x1.2416bf9669a04p-25),
+    (double2)(0x1.317c210000000p+0, 0x1.11c96b2b3987fp-25),
+    (double2)(0x1.31b8020000000p+0, 0x1.f99ed447e1177p-25),
+    (double2)(0x1.31f3cd0000000p+0, 0x1.3245826328a11p-30),
+    (double2)(0x1.322f7f0000000p+0, 0x1.6f56dd1e645f8p-25),
+    (double2)(0x1.326b1b0000000p+0, 0x1.6164946945535p-27),
+    (double2)(0x1.32a69f0000000p+0, 0x1.e37d59d190028p-26),
+    (double2)(0x1.32e20c0000000p+0, 0x1.68671f12bf828p-25),
+    (double2)(0x1.331d620000000p+0, 0x1.e8ecbca6aabbdp-25),
+    (double2)(0x1.3358a20000000p+0, 0x1.3f49e109a5912p-26),
+    (double2)(0x1.3393ca0000000p+0, 0x1.b8a0e11ec3043p-25),
+    (double2)(0x1.33cedc0000000p+0, 0x1.5fae00aed691ap-25),
+    (double2)(0x1.3409d70000000p+0, 0x1.c0569bece3e4ap-25),
+    (double2)(0x1.3444bc0000000p+0, 0x1.05e26744efbfep-25),
+    (double2)(0x1.347f8a0000000p+0, 0x1.5b570a94be5c5p-25),
+    (double2)(0x1.34ba420000000p+0, 0x1.d6f156ea0e063p-26),
+    (double2)(0x1.34f4e30000000p+0, 0x1.e0ca7612fc484p-25),
+    (double2)(0x1.352f6f0000000p+0, 0x1.963c927b25258p-27),
+    (double2)(0x1.3569e40000000p+0, 0x1.47930aa725a5cp-26),
+    (double2)(0x1.35a4430000000p+0, 0x1.8a79fe3af43b3p-26),
+    (double2)(0x1.35de8c0000000p+0, 0x1.e6dc29c41bdafp-26),
+    (double2)(0x1.3618bf0000000p+0, 0x1.57a2e76f863a5p-25),
+    (double2)(0x1.3652dd0000000p+0, 0x1.ae3b61716354dp-29),
+    (double2)(0x1.368ce40000000p+0, 0x1.65fb5df6906b1p-25),
+    (double2)(0x1.36c6d60000000p+0, 0x1.6177d7f588f7bp-25),
+    (double2)(0x1.3700b30000000p+0, 0x1.ad55abd091b67p-28),
+    (double2)(0x1.373a7a0000000p+0, 0x1.55337b2422d76p-30),
+    (double2)(0x1.37742b0000000p+0, 0x1.084ebe86972d5p-25),
+    (double2)(0x1.37adc70000000p+0, 0x1.56395808e1ea3p-25),
+    (double2)(0x1.37e74e0000000p+0, 0x1.1bce21b40fba7p-25),
+    (double2)(0x1.3820c00000000p+0, 0x1.006f94605b515p-26),
+    (double2)(0x1.385a1c0000000p+0, 0x1.aa676aceb1f7dp-25),
+    (double2)(0x1.3893640000000p+0, 0x1.8229f76554ce6p-26),
+    (double2)(0x1.38cc960000000p+0, 0x1.eabfc6cf57330p-25),
+    (double2)(0x1.3905b40000000p+0, 0x1.4daed9c0ce8bcp-25),
+    (double2)(0x1.393ebd0000000p+0, 0x1.0ff1768237141p-25),
+    (double2)(0x1.3977b10000000p+0, 0x1.575f83051b085p-25),
+    (double2)(0x1.39b0910000000p+0, 0x1.2667deb523e29p-27),
+    (double2)(0x1.39e95c0000000p+0, 0x1.816996954f4fdp-30),
+    (double2)(0x1.3a22120000000p+0, 0x1.87cfccf4d9cd4p-26),
+    (double2)(0x1.3a5ab40000000p+0, 0x1.2c5d018198353p-26),
+    (double2)(0x1.3a93410000000p+0, 0x1.a7a898dcc34aap-25),
+    (double2)(0x1.3acbbb0000000p+0, 0x1.cead6dadc36d1p-29),
+    (double2)(0x1.3b04200000000p+0, 0x1.a55759c498bdfp-29),
+    (double2)(0x1.3b3c700000000p+0, 0x1.c414a9ef6de04p-25),
+    (double2)(0x1.3b74ad0000000p+0, 0x1.3e2108a6e58fap-25),
+    (double2)(0x1.3bacd60000000p+0, 0x1.587fd7643d77cp-26),
+    (double2)(0x1.3be4eb0000000p+0, 0x1.901eb1d3ff3dfp-28),
+    (double2)(0x1.3c1ceb0000000p+0, 0x1.f2ccd7c812fc6p-25),
+    (double2)(0x1.3c54d90000000p+0, 0x1.1c8ee70a01049p-29),
+    (double2)(0x1.3c8cb20000000p+0, 0x1.63e8d02831eecp-26),
+    (double2)(0x1.3cc4770000000p+0, 0x1.f61a42a92c7ffp-25),
+    (double2)(0x1.3cfc2a0000000p+0, 0x1.a917399c84d24p-34),
+    (double2)(0x1.3d33c80000000p+0, 0x1.e9197c8eec2f0p-26),
+    (double2)(0x1.3d6b530000000p+0, 0x1.e6f842f5a1378p-26),
+    (double2)(0x1.3da2cb0000000p+0, 0x1.fac242a90a0fcp-29),
+    (double2)(0x1.3dda2f0000000p+0, 0x1.35ed726610227p-26),
+    (double2)(0x1.3e11800000000p+0, 0x1.0e0d64804b15bp-26),
+    (double2)(0x1.3e48be0000000p+0, 0x1.560675daba814p-31),
+    (double2)(0x1.3e7fe80000000p+0, 0x1.37388c8768032p-25),
+    (double2)(0x1.3eb7000000000p+0, 0x1.ee3c89f9e01f5p-28),
+    (double2)(0x1.3eee040000000p+0, 0x1.39f6f0d09747cp-25),
+    (double2)(0x1.3f24f60000000p+0, 0x1.322c327abb8f0p-27),
+    (double2)(0x1.3f5bd40000000p+0, 0x1.961b347c8ac80p-25),
+    (double2)(0x1.3f92a00000000p+0, 0x1.3711fbbd0f118p-25),
+    (double2)(0x1.3fc9590000000p+0, 0x1.4fad8d7718ffbp-25),
+    (double2)(0x1.3fffff0000000p+0, 0x1.fffffffffffffp-25),
+    (double2)(0x1.4036930000000p+0, 0x1.67efa79ec35b4p-25),
+    (double2)(0x1.406d140000000p+0, 0x1.a737687a254a8p-25),
+    (double2)(0x1.40a3830000000p+0, 0x1.bace0f87d924dp-26),
+    (double2)(0x1.40d9df0000000p+0, 0x1.29e37c237e392p-25),
+    (double2)(0x1.4110290000000p+0, 0x1.57ce7ac3f3012p-26),
+    (double2)(0x1.4146600000000p+0, 0x1.82829359f8fbdp-25),
+    (double2)(0x1.417c850000000p+0, 0x1.cc9be42d14676p-25),
+    (double2)(0x1.41b2980000000p+0, 0x1.a8f001c137d0bp-25),
+    (double2)(0x1.41e8990000000p+0, 0x1.36127687dda05p-25),
+    (double2)(0x1.421e880000000p+0, 0x1.24dba322646f0p-26),
+    (double2)(0x1.4254640000000p+0, 0x1.dc43f1ed210b4p-25),
+    (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25),
+)
+
+DECLARE_TABLE(double2, CBRT_TBL_REM, 5,
+    (double2)(0x1.428a2f0000000p-1, 0x1.31ae515c447bbp-26),
+    (double2)(0x1.965fea0000000p-1, 0x1.4f5b8f20ac166p-27),
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25),
+    (double2)(0x1.965fea0000000p+0, 0x1.4f5b8f20ac166p-26),
+)
+
diff --git a/amd-builtins/math64/ceilD.cl b/amd-builtins/math64/ceilD.cl
new file mode 100644
index 0000000..272d589
--- /dev/null
+++ b/amd-builtins/math64/ceilD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__ ((overloadable, always_inline)) double
+ceil(double x)
+{
+    return __amdil_round_posinf_f64(x);
+}
+
diff --git a/amd-builtins/math64/copysignD.cl b/amd-builtins/math64/copysignD.cl
new file mode 100644
index 0000000..818e5f7
--- /dev/null
+++ b/amd-builtins/math64/copysignD.cl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+// __hsail_ intrinsic which has no __amdil_ equivalent.
+extern __attribute__((pure)) double  __hsail_copysign_f64(double, double);
+
+__attribute__((overloadable, always_inline)) double
+copysign(double x, double y)
+{
+    return __hsail_copysign_f64(x, y);
+}
+
diff --git a/amd-builtins/math64/cosD.cl b/amd-builtins/math64/cosD.cl
new file mode 100644
index 0000000..74cabec
--- /dev/null
+++ b/amd-builtins/math64/cosD.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "sincosD_piby4.h"
+#include "remainderD_piby2.h"
+
+__attribute__((overloadable, always_inline, pure, weak)) double
+cos(double x)
+{
+    x = fabs(x);
+
+    double r, rr;
+    int regn;
+
+    if (x < 0x1.0p+47)
+        remainder_piby2_medium(x, &r, &rr, &regn);
+    else 
+        remainder_piby2_large(x, &r, &rr, &regn);
+
+    double2 sc = sincos_piby4(r, rr);
+    sc.lo = -sc.lo;
+
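+    // With x reduced so that x ~= regn*(pi/2) + r, cos(x) cycles through
+    // cos(r), -sin(r), -cos(r), sin(r): odd regn selects the negated sine
+    // computed above, and regn > 1 flips the sign bit of the result.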
+    int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
+    c.hi ^= (regn > 1) << 31;
+
+    return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
+}
+
diff --git a/amd-builtins/math64/coshD.cl b/amd-builtins/math64/coshD.cl
new file mode 100644
index 0000000..bc4a4fa
--- /dev/null
+++ b/amd-builtins/math64/coshD.cl
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+cosh(double x)
+{
+    USE_TABLE(double2, sinh_tbl, SINH_TBL);
+    USE_TABLE(double2, cosh_tbl, COSH_TBL);
+
+    // After dealing with special cases the computation is split into
+    // regions as follows (cosh is even, so only y = abs(x) matters):
+    //
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = +Inf
+    //
+    // abs(x) >= small_threshold:
+    // cosh(x) = exp(abs(x))/2, computed as exp(abs(x) - ln(2)) with a
+    // small tail correction for the discarded low bits of ln(2).
+    //
+    // abs(x) < small_threshold:
+    // split y into integer and fractional parts, y = y0 + dy, and compute
+    // cosh(x) = cosh(y0)*cosh(dy) + sinh(y0)*sinh(dy) from the tables.
+
+    // This is ln(2^1025)
+    const double max_cosh_arg = 7.10475860073943977113e+02;      /* 0x408633ce8fb9f87e */
+
+    // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27)
+    const double small_threshold = 0x1.2b708872320e2p+4;
+
+    double y = fabs(x);
+
+    // In this range we find the integer part y0 of y
+    // and the increment dy = y - y0. We then compute
+    // z = cosh(y) = cosh(y0)*cosh(dy) + sinh(y0)*sinh(dy)
+    // where sinh(y0) and cosh(y0) are tabulated.
+
+    int ind = min((int)y, 36);
+    double dy = y - ind;
+    double dy2 = dy * dy;
+
+    double sdy = dy * dy2 *
+	         fma(dy2,
+		     fma(dy2,
+			 fma(dy2,
+			     fma(dy2,
+				 fma(dy2,
+				     fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9),
+				     0.250521176994133472333666e-7),
+				 0.275573191913636406057211e-5),
+			     0.198412698413242405162014e-3),
+			 0.833333333333329931873097e-2),
+		     0.166666666666666667013899e0);
+
+    double cdy = dy2 * fma(dy2,
+	                   fma(dy2,
+			       fma(dy2,
+				   fma(dy2,
+				       fma(dy2,
+					   fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8),
+					   0.275573350756016588011357e-6),
+				       0.248015872460622433115785e-4),
+				   0.138888888889814854814536e-2),
+			       0.416666666666660876512776e-1),
+			   0.500000000000000005911074e0);
+
+    // At this point sinh(dy) is approximated by dy + sdy,
+    // and cosh(dy) is approximated by 1 + cdy.
+    double2 tv = cosh_tbl[ind];
+    double cl = tv.s0;
+    double ct = tv.s1;
+    tv = sinh_tbl[ind];
+    double sl = tv.s0;
+    double st = tv.s1;
+
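+    // z expands to (cl + ct)*(1 + cdy) + (sl + st)*(dy + sdy), i.e.
+    // cosh(y0)*cosh(dy) + sinh(y0)*sinh(dy), with the fma chain summing
+    // the smallest terms first to preserve accuracy.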
+    double z = fma(sl, dy, fma(sl, sdy, fma(cl, cdy, fma(st, dy, fma(st, sdy, ct*cdy)) + ct))) + cl;
+
+    // Other cases
+    z = y < 0x1.0p-28 ? 1.0 : z;
+
+    double t = exp(y - 0x1.62e42fefa3800p-1);
+    t =  fma(t, -0x1.ef35793c76641p-45, t);
+    z = y >= small_threshold ? t : z;
+
+    z = y >= max_cosh_arg ? as_double(PINFBITPATT_DP64) : z;
+
+    z = isinf(x) | isnan(x) ? y : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math64/cospiD.cl b/amd-builtins/math64/cospiD.cl
new file mode 100644
index 0000000..0297cb0
--- /dev/null
+++ b/amd-builtins/math64/cospiD.cl
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "sincosD_piby4.h"
+
+__attribute__((overloadable)) double
+cospi(double x)
+{
+    const double pi = 3.1415926535897932384626433832795;
+
+    long ix = as_long(x) & 0x7fffffffffffffffL; 
+    double ax = as_double(ix);
+    long iax = (long)ax;
+    double r = ax - (double)iax;
+    long xodd = iax & 0x1L ? 0x8000000000000000L : 0L;
+
+    // Initialize with return for +-Inf and NaN
+    long ir = 0x7ff8000000000000L;
+
+    // 2^53 <= |x| < Inf: x is an even integer, so cospi(x) = 1
+    ir = ix < 0x7ff0000000000000L ? 0x3ff0000000000000L : ir;
+
+    // 2^52 <= |x| < 2^53: x is an integer, so cospi(x) = +-1 by parity
+    ir = ax < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir;
+
+    // |x| < 2^52: the result depends on which quarter-unit interval
+    // r = |x| - floor(|x|) falls in
+
+    // r < 1.0
+    double a = 1.0 - r;
+    int e = 1;
+    long s = xodd ^ 0x8000000000000000L;
+
+    // r <= 0.75
+    int c = r <= 0.75;
+    double t = r - 0.5;
+    a = c ? t : a;
+    e = c ? 0 : e;
+
+    // r < 0.5
+    c = r < 0.5;
+    t = 0.5 - r;
+    a = c ? t : a;
+    s = c ? xodd : s;
+
+    // r <= 0.25
+    c = r <= 0.25;
+    a = c ? r : a;
+    e = c ? 1 : e;
+
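+    // The ladder above reduces r to a in [0, 0.25] using
+    // cos(pi*r) = sin(pi*(0.5 - r)) on (0.25, 0.5),
+    //           = -sin(pi*(r - 0.5)) on [0.5, 0.75], and
+    //           = -cos(pi*(1 - r)) on (0.75, 1);
+    // e selects cos vs sin of pi*a, and s accumulates the sign (xodd
+    // accounts for the parity of the integer part).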
+    double2 sc = sincos_piby4(a * pi, 0.0);
+    long jr = s ^ as_long(e ? sc.hi : sc.lo);
+
+    ir = ax < 0x1.0p+52 ? jr : ir;
+
+    return as_double(ir);
+}
+
diff --git a/amd-builtins/math64/ep_logD.h b/amd-builtins/math64/ep_logD.h
new file mode 100644
index 0000000..1ba9b15
--- /dev/null
+++ b/amd-builtins/math64/ep_logD.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define LN0 8.33333333333317923934e-02
+#define LN1 1.25000000037717509602e-02
+#define LN2 2.23213998791944806202e-03
+#define LN3 4.34887777707614552256e-04
+
+#define LF0 8.33333333333333593622e-02
+#define LF1 1.24999999978138668903e-02
+#define LF2 2.23219810758559851206e-03
+
+static inline void
+ep_log(double x, int *xexp, double *r1, double *r2)
+{
+    USE_TABLE(double2, p_tbl, LN_TBL);
+
+    // Computes natural log(x). Algorithm based on:
+    // Ping-Tak Peter Tang
+    // "Table-driven implementation of the logarithm function in IEEE
+    // floating-point arithmetic"
+    // ACM Transactions on Mathematical Software (TOMS)
+    // Volume 16, Issue 4 (December 1990)
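+    //
+    // The result comes back in two pieces: *r1 holds the high-order part
+    // and *r2 a small correction term, so callers can reconstruct
+    // log(x) as xexp*ln(2) + (*r1 + *r2) in extra precision.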
+    int near_one = (x >= 0x1.e0faap-1) & (x <= 0x1.1082cp+0);
+
+    ulong ux = as_ulong(x);
+    ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
+    int c = ux < IMPBIT_DP64;
+    ux = c ? uxs : ux;
+    int expadjust = c ? 60 : 0;
+
+    // Store the exponent of x in xexp and put f into the range [0.5,1)
+    int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
+    double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
+    *xexp = near_one ? 0 : xexp1;
+
+    double r = x - 1.0;
+    double u1 = MATH_DIVIDE(r, 2.0 + r);
+    double ru1 = -r * u1;
+    u1 = u1 + u1;
+
+    int index = as_int2(ux).hi >> 13;
+    index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);
+
+    double f1 = index * 0x1.0p-7;
+    double f2 = f - f1;
+    double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1));
+
+    double2 tv = p_tbl[index - 64];
+    double z1 = tv.s0;
+    double q = tv.s1;
+
+    z1 = near_one ? r : z1;
+    q = near_one ? 0.0 : q;
+    double u = near_one ? u1 : u2;
+    double v = u*u;
+
+    double cc = near_one ? ru1 : u2;
+
+    double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0);
+    double z22 = fma(v, fma(v, LF2, LF1), LF0);
+    double z2 = near_one ? z21 : z22;
+    z2 = fma(u*v, z2, cc) + q;
+
+    *r1 = z1;
+    *r2 = z2;
+}
+
diff --git a/amd-builtins/math64/erfD.cl b/amd-builtins/math64/erfD.cl
new file mode 100644
index 0000000..1efa936
--- /dev/null
+++ b/amd-builtins/math64/erfD.cl
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* double erf(double x)
+ * double erfc(double x)
+ *                             x
+ *                      2      |\
+ *     erf(x)  =  ---------  | exp(-t*t)dt
+ *                    sqrt(pi) \|
+ *                             0
+ *
+ *     erfc(x) =  1-erf(x)
+ *  Note that
+ *                erf(-x) = -erf(x)
+ *                erfc(-x) = 2 - erfc(x)
+ *
+ * Method:
+ *        1. For |x| in [0, 0.84375]
+ *            erf(x)  = x + x*R(x^2)
+ *          erfc(x) = 1 - erf(x)           if x in [-.84375,0.25]
+ *                  = 0.5 + ((0.5-x)-x*R)  if x in [0.25,0.84375]
+ *           where R = P/Q where P is an odd poly of degree 8 and
+ *           Q is an odd poly of degree 10.
+ *                                                 -57.90
+ *                        | R - (erf(x)-x)/x | <= 2
+ *
+ *
+ *           Remark. The formula is derived by noting
+ *          erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
+ *           and that
+ *          2/sqrt(pi) = 1.128379167095512573896158903121545171688
+ *           is close to one. The interval is chosen because the fixed
+ *           point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
+ *           near 0.6174), and by some experiment, 0.84375 is chosen to
+ *            guarantee the error is less than one ulp for erf.
+ *
+ *      2. For |x| in [0.84375,1.25], let s = |x| - 1, and
+ *         c = 0.84506291151 rounded to single (24 bits)
+ *                 erf(x)  = sign(x) * (c  + P1(s)/Q1(s))
+ *                 erfc(x) = (1-c)  - P1(s)/Q1(s) if x > 0
+ *                          1+(c+P1(s)/Q1(s))    if x < 0
+ *                 |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
+ *           Remark: here we use the Taylor series expansion at x=1.
+ *                erf(1+s) = erf(1) + s*Poly(s)
+ *                         = 0.845.. + P1(s)/Q1(s)
+ *           That is, we use rational approximation to approximate
+ *                        erf(1+s) - (c = (single)0.84506291151)
+ *           Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
+ *           where
+ *                P1(s) = degree 6 poly in s
+ *                Q1(s) = degree 6 poly in s
+ *
+ *      3. For x in [1.25,1/0.35(~2.857143)],
+ *                 erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
+ *                 erf(x)  = 1 - erfc(x)
+ *           where
+ *                R1(z) = degree 7 poly in z, (z=1/x^2)
+ *                S1(z) = degree 8 poly in z
+ *
+ *      4. For x in [1/0.35,28]
+ *                 erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
+ *                        = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
+ *                        = 2.0 - tiny                (if x <= -6)
+ *                 erf(x)  = sign(x)*(1.0 - erfc(x)) if x < 6, else
+ *                 erf(x)  = sign(x)*(1.0 - tiny)
+ *           where
+ *                R2(z) = degree 6 poly in z, (z=1/x^2)
+ *                S2(z) = degree 7 poly in z
+ *
+ *      Note1:
+ *           To compute exp(-x*x-0.5625+R/S), let s be a single
+ *           precision number and s := x; then
+ *                -x*x = -s*s + (s-x)*(s+x)
+ *                exp(-x*x-0.5625+R/S) =
+ *                        exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
+ *      Note2:
+ *           Here 3 and 4 make use of the asymptotic series
+ *                          exp(-x*x)
+ *                erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
+ *                          x*sqrt(pi)
+ *           We use rational approximation to approximate
+ *              g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
+ *           Here is the error bound for R1/S1 and R2/S2
+ *              |R1/S1 - f(x)|  < 2**(-62.57)
+ *              |R2/S2 - f(x)|  < 2**(-61.52)
+ *
+ *      5. For inf > x >= 28
+ *                 erf(x)  = sign(x) *(1 - tiny)  (raise inexact)
+ *                 erfc(x) = tiny*tiny (raise underflow) if x > 0
+ *                        = 2 - tiny if x<0
+ *
+ *      6. Special case:
+ *                 erf(0)  = 0, erf(inf)  = 1, erf(-inf) = -1,
+ *                 erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2,
+ *                   erfc/erf(NaN) is NaN
+ */
+
+#define AU0 -9.86494292470009928597e-03
+#define AU1 -7.99283237680523006574e-01
+#define AU2 -1.77579549177547519889e+01
+#define AU3 -1.60636384855821916062e+02
+#define AU4 -6.37566443368389627722e+02
+#define AU5 -1.02509513161107724954e+03
+#define AU6 -4.83519191608651397019e+02
+
+#define AV1  3.03380607434824582924e+01
+#define AV2  3.25792512996573918826e+02
+#define AV3  1.53672958608443695994e+03
+#define AV4  3.19985821950859553908e+03
+#define AV5  2.55305040643316442583e+03
+#define AV6  4.74528541206955367215e+02
+#define AV7 -2.24409524465858183362e+01
+
+#define BU0 -9.86494403484714822705e-03
+#define BU1 -6.93858572707181764372e-01
+#define BU2 -1.05586262253232909814e+01
+#define BU3 -6.23753324503260060396e+01
+#define BU4 -1.62396669462573470355e+02
+#define BU5 -1.84605092906711035994e+02
+#define BU6 -8.12874355063065934246e+01
+#define BU7 -9.81432934416914548592e+00
+
+#define BV1  1.96512716674392571292e+01
+#define BV2  1.37657754143519042600e+02
+#define BV3  4.34565877475229228821e+02
+#define BV4  6.45387271733267880336e+02
+#define BV5  4.29008140027567833386e+02
+#define BV6  1.08635005541779435134e+02
+#define BV7  6.57024977031928170135e+00
+#define BV8 -6.04244152148580987438e-02
+
+#define CU0 -2.36211856075265944077e-03
+#define CU1  4.14856118683748331666e-01
+#define CU2 -3.72207876035701323847e-01
+#define CU3  3.18346619901161753674e-01
+#define CU4 -1.10894694282396677476e-01
+#define CU5  3.54783043256182359371e-02
+#define CU6 -2.16637559486879084300e-03
+
+#define CV1  1.06420880400844228286e-01
+#define CV2  5.40397917702171048937e-01
+#define CV3  7.18286544141962662868e-02
+#define CV4  1.26171219808761642112e-01
+#define CV5  1.36370839120290507362e-02
+#define CV6  1.19844998467991074170e-02
+
+#define DU0  1.28379167095512558561e-01
+#define DU1 -3.25042107247001499370e-01
+#define DU2 -2.84817495755985104766e-02
+#define DU3 -5.77027029648944159157e-03
+#define DU4 -2.37630166566501626084e-05
+
+#define DV1  3.97917223959155352819e-01
+#define DV2  6.50222499887672944485e-02
+#define DV3  5.08130628187576562776e-03
+#define DV4  1.32494738004321644526e-04
+#define DV5 -3.96022827877536812320e-06
+
+__attribute__((overloadable)) double
+erf(double y)
+{
+    double x = fabs(y);
+    double x2 = x * x;
+    double xm1 = x - 1.0;
+
+    // Poly variable
+    double t = 1.0 / x2;
+    t = x < 1.25 ? xm1 : t;
+    t = x < 0.84375 ? x2 : t;
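+    // t is the per-region polynomial argument: 1/x^2 in the asymptotic
+    // regions, x - 1 on [0.84375, 1.25), and x^2 below 0.84375.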
+
+    double u, ut, v, vt;
+
+    // Evaluate rational poly
+    // XXX We need to see if we can grab 16 coefficients from a table
+    // faster than evaluating 3 of the poly pairs
+    // if (x < 6.0)
+    u = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AU6, AU5), AU4), AU3), AU2), AU1), AU0);
+    v = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AV7, AV6), AV5), AV4), AV3), AV2), AV1);
+
+    ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0);
+    vt = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BV8, BV7), BV6), BV5), BV4), BV3), BV2), BV1);
+    u = x < 0x1.6db6ep+1 ? ut : u;
+    v = x < 0x1.6db6ep+1 ? vt : v;
+
+    ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, CU6, CU5), CU4), CU3), CU2), CU1), CU0);
+    vt = fma(t, fma(t, fma(t, fma(t, fma(t, CV6, CV5), CV4), CV3), CV2), CV1);
+    u = x < 1.25 ? ut : u;
+    v = x < 1.25 ? vt : v;
+
+    ut = fma(t, fma(t, fma(t, fma(t, DU4, DU3), DU2), DU1), DU0);
+    vt = fma(t, fma(t, fma(t, fma(t, DV5, DV4), DV3), DV2), DV1);
+    u = x < 0.84375 ? ut : u;
+    v = x < 0.84375 ? vt : v;
+
+    v = fma(t, v, 1.0);
+
+    // Compute rational approximation
+    double q = u / v;
+
+    // Compute results
+    double z = as_double(as_long(x) & 0xffffffff00000000L);
+    double r = exp(-z * z - 0.5625) * exp((z - x) * (z + x) + q);
+    r = 1.0 - r / x;
+
+    double ret = x < 6.0 ? r : 1.0;
+
+    r = 8.45062911510467529297e-01 + q;
+    ret = x < 1.25 ? r : ret;
+
+    q = x < 0x1.0p-28 ? 1.28379167095512586316e-01 : q;
+
+    r = fma(x, q, x);
+    ret = x < 0.84375 ? r : ret;
+
+    ret = isnan(x) ? x : ret;
+
+    return y < 0.0 ? -ret : ret;
+}
+
diff --git a/amd-builtins/math64/erfcD.cl b/amd-builtins/math64/erfcD.cl
new file mode 100644
index 0000000..5321224
--- /dev/null
+++ b/amd-builtins/math64/erfcD.cl
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* double erf(double x)
+ * double erfc(double x)
+ *                             x
+ *                      2      |\
+ *     erf(x)  =  ---------  | exp(-t*t)dt
+ *                    sqrt(pi) \|
+ *                             0
+ *
+ *     erfc(x) =  1-erf(x)
+ *  Note that
+ *                erf(-x) = -erf(x)
+ *                erfc(-x) = 2 - erfc(x)
+ *
+ * Method:
+ *        1. For |x| in [0, 0.84375]
+ *            erf(x)  = x + x*R(x^2)
+ *          erfc(x) = 1 - erf(x)           if x in [-.84375,0.25]
+ *                  = 0.5 + ((0.5-x)-x*R)  if x in [0.25,0.84375]
+ *           where R = P/Q where P is an odd poly of degree 8 and
+ *           Q is an odd poly of degree 10.
+ *                                                 -57.90
+ *                        | R - (erf(x)-x)/x | <= 2
+ *
+ *
+ *           Remark. The formula is derived by noting
+ *          erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
+ *           and that
+ *          2/sqrt(pi) = 1.128379167095512573896158903121545171688
+ *           is close to one. The interval is chosen because the fixed
+ *           point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
+ *           near 0.6174), and by some experiment, 0.84375 is chosen to
+ *            guarantee the error is less than one ulp for erf.
+ *
+ *      2. For |x| in [0.84375,1.25], let s = |x| - 1, and
+ *         c = 0.84506291151 rounded to single (24 bits)
+ *                 erf(x)  = sign(x) * (c  + P1(s)/Q1(s))
+ *                 erfc(x) = (1-c)  - P1(s)/Q1(s) if x > 0
+ *                          1+(c+P1(s)/Q1(s))    if x < 0
+ *                 |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
+ *           Remark: here we use the Taylor series expansion at x=1.
+ *                erf(1+s) = erf(1) + s*Poly(s)
+ *                         = 0.845.. + P1(s)/Q1(s)
+ *           That is, we use rational approximation to approximate
+ *                        erf(1+s) - (c = (single)0.84506291151)
+ *           Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
+ *           where
+ *                P1(s) = degree 6 poly in s
+ *                Q1(s) = degree 6 poly in s
+ *
+ *      3. For x in [1.25,1/0.35(~2.857143)],
+ *                 erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
+ *                 erf(x)  = 1 - erfc(x)
+ *           where
+ *                R1(z) = degree 7 poly in z, (z=1/x^2)
+ *                S1(z) = degree 8 poly in z
+ *
+ *      4. For x in [1/0.35,28]
+ *                 erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
+ *                        = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
+ *                        = 2.0 - tiny                (if x <= -6)
+ *                 erf(x)  = sign(x)*(1.0 - erfc(x)) if x < 6, else
+ *                 erf(x)  = sign(x)*(1.0 - tiny)
+ *           where
+ *                R2(z) = degree 6 poly in z, (z=1/x^2)
+ *                S2(z) = degree 7 poly in z
+ *
+ *      Note1:
+ *           To compute exp(-x*x-0.5625+R/S), let s be a single
+ *           precision number and s := x; then
+ *                -x*x = -s*s + (s-x)*(s+x)
+ *                exp(-x*x-0.5625+R/S) =
+ *                        exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
+ *      Note2:
+ *           Here 3 and 4 make use of the asymptotic series
+ *                          exp(-x*x)
+ *                erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
+ *                          x*sqrt(pi)
+ *           We use rational approximation to approximate
+ *              g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
+ *           Here is the error bound for R1/S1 and R2/S2
+ *              |R1/S1 - f(x)|  < 2**(-62.57)
+ *              |R2/S2 - f(x)|  < 2**(-61.52)
+ *
+ *      5. For inf > x >= 28
+ *                 erf(x)  = sign(x) *(1 - tiny)  (raise inexact)
+ *                 erfc(x) = tiny*tiny (raise underflow) if x > 0
+ *                        = 2 - tiny if x<0
+ *
+ *      6. Special case:
+ *                 erf(0)  = 0, erf(inf)  = 1, erf(-inf) = -1,
+ *                 erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2,
+ *                   erfc/erf(NaN) is NaN
+ */
+
+#define AU0 -9.86494292470009928597e-03
+#define AU1 -7.99283237680523006574e-01
+#define AU2 -1.77579549177547519889e+01
+#define AU3 -1.60636384855821916062e+02
+#define AU4 -6.37566443368389627722e+02
+#define AU5 -1.02509513161107724954e+03
+#define AU6 -4.83519191608651397019e+02
+
+#define AV0  3.03380607434824582924e+01
+#define AV1  3.25792512996573918826e+02
+#define AV2  1.53672958608443695994e+03
+#define AV3  3.19985821950859553908e+03
+#define AV4  2.55305040643316442583e+03
+#define AV5  4.74528541206955367215e+02
+#define AV6 -2.24409524465858183362e+01
+
+#define BU0 -9.86494403484714822705e-03
+#define BU1 -6.93858572707181764372e-01
+#define BU2 -1.05586262253232909814e+01
+#define BU3 -6.23753324503260060396e+01
+#define BU4 -1.62396669462573470355e+02
+#define BU5 -1.84605092906711035994e+02
+#define BU6 -8.12874355063065934246e+01
+#define BU7 -9.81432934416914548592e+00
+
+#define BV0  1.96512716674392571292e+01
+#define BV1  1.37657754143519042600e+02
+#define BV2  4.34565877475229228821e+02
+#define BV3  6.45387271733267880336e+02
+#define BV4  4.29008140027567833386e+02
+#define BV5  1.08635005541779435134e+02
+#define BV6  6.57024977031928170135e+00
+#define BV7 -6.04244152148580987438e-02
+
+#define CU0 -2.36211856075265944077e-03
+#define CU1  4.14856118683748331666e-01
+#define CU2 -3.72207876035701323847e-01
+#define CU3  3.18346619901161753674e-01
+#define CU4 -1.10894694282396677476e-01
+#define CU5  3.54783043256182359371e-02
+#define CU6 -2.16637559486879084300e-03
+
+#define CV0 1.06420880400844228286e-01
+#define CV1 5.40397917702171048937e-01
+#define CV2 7.18286544141962662868e-02
+#define CV3 1.26171219808761642112e-01
+#define CV4 1.36370839120290507362e-02
+#define CV5 1.19844998467991074170e-02
+
+#define DU0  1.28379167095512558561e-01
+#define DU1 -3.25042107247001499370e-01
+#define DU2 -2.84817495755985104766e-02
+#define DU3 -5.77027029648944159157e-03
+#define DU4 -2.37630166566501626084e-05
+
+#define DV0  3.97917223959155352819e-01
+#define DV1  6.50222499887672944485e-02
+#define DV2  5.08130628187576562776e-03
+#define DV3  1.32494738004321644526e-04
+#define DV4 -3.96022827877536812320e-06
+
+__attribute__((overloadable)) double
+erfc(double x)
+{
+    long lx = as_long(x);
+    long ax = lx & 0x7fffffffffffffffL;
+    double absx = as_double(ax);
+    int xneg = lx != ax;
+
+    // Poly arg
+    double x2 = x * x;
+    double xm1 = absx - 1.0;
+    double t = 1.0 / x2;
+    t = absx < 1.25 ? xm1 : t;
+    t = absx < 0.84375 ? x2 : t;
+
+    // Evaluate rational poly
+    // XXX Need to evaluate if we can grab the 14 coefficients from a
+    // table faster than evaluating 3 pairs of polys
+    double tu, tv, u, v;
+
+    // |x| < 28
+    u = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AU6, AU5), AU4), AU3), AU2), AU1), AU0);
+    v = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AV6, AV5), AV4), AV3), AV2), AV1), AV0);
+
+    tu = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0);
+    tv = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BV7, BV6), BV5), BV4), BV3), BV2), BV1), BV0);
+    u = absx < 0x1.6db6dp+1 ? tu : u;
+    v = absx < 0x1.6db6dp+1 ? tv : v;
+
+    tu = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, CU6, CU5), CU4), CU3), CU2), CU1), CU0);
+    tv = fma(t, fma(t, fma(t, fma(t, fma(t, CV5, CV4), CV3), CV2), CV1), CV0);
+    u = absx < 1.25 ? tu : u;
+    v = absx < 1.25 ? tv : v;
+
+    tu = fma(t, fma(t, fma(t, fma(t, DU4, DU3), DU2), DU1), DU0);
+    tv = fma(t, fma(t, fma(t, fma(t, DV4, DV3), DV2), DV1), DV0);
+    u = absx < 0.84375 ? tu : u;
+    v = absx < 0.84375 ? tv : v;
+
+    v = fma(t, v, 1.0);
+    double q = u / v;
+
+    // Evaluate return value
+
+    // |x| < 28
+    double z = as_double(ax & 0xffffffff00000000UL);
+    double ret = exp(-z * z - 0.5625) * exp((z - absx) * (z + absx) + q) / absx;
+    t = 2.0 - ret;
+    ret = xneg ? t : ret;
+
+    const double erx = 8.45062911510467529297e-01;
+    z = erx + q + 1.0;
+    t = 1.0 - erx - q;
+    t = xneg ? z : t;
+    ret = absx < 1.25 ? t : ret;
+
+    // z = 1.0 - fma(x, q, x);
+    // t = 0.5 - fma(x, q, x - 0.5);
+    // t = xneg == 1 | absx < 0.25 ? z : t;
+    t = fma(-x, q, 1.0 - x);
+    ret = absx < 0.84375 ? t : ret;
+
+    ret = x >= 28.0 ? 0.0 : ret;
+    ret = x <= -6.0 ? 2.0 : ret;
+    ret = ax > 0x7ff0000000000000UL ? x : ret;
+
+    return ret;
+}
+
diff --git a/amd-builtins/math64/exp10D.cl b/amd-builtins/math64/exp10D.cl
new file mode 100644
index 0000000..b330b28
--- /dev/null
+++ b/amd-builtins/math64/exp10D.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_EXP10
+#include "expD_base.h"
+
diff --git a/amd-builtins/math64/exp2D.cl b/amd-builtins/math64/exp2D.cl
new file mode 100644
index 0000000..938c594
--- /dev/null
+++ b/amd-builtins/math64/exp2D.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_EXP2
+#include "expD_base.h"
+
diff --git a/amd-builtins/math64/expD.cl b/amd-builtins/math64/expD.cl
new file mode 100644
index 0000000..cf21877
--- /dev/null
+++ b/amd-builtins/math64/expD.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_EXP
+#include "expD_base.h"
+
diff --git a/amd-builtins/math64/expD_base.h b/amd-builtins/math64/expD_base.h
new file mode 100644
index 0000000..6cc2a6d
--- /dev/null
+++ b/amd-builtins/math64/expD_base.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+//   Algorithm:
+//
+//   e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
+//
+//   x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
+//   n = 64*m + j,   0 <= j < 64
+//
+//   e^x = 2^((64*m + j + f)/64)
+//       = (2^m) * (2^(j/64)) * 2^(f/64)
+//       = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
+//
+//   f = x*(64/ln(2)) - n
+//   r = f*(ln(2)/64) = x - n*(ln(2)/64)
+//
+//   e^x = (2^m) * (2^(j/64)) * e^r
+//
+//   (2^(j/64)) is precomputed
+//
+//   e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+//   e^r = 1 + q
+//
+//   q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+//
+//   e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
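+//
+//   Worked example (illustrative, assuming round-toward-zero conversion):
+//   for x = 1.0, n = (int)(1.0 * 64/ln(2)) = 92, so m = 92 >> 6 = 1 and
+//   j = 92 & 0x3f = 28. Then r = 1.0 - 92*(ln(2)/64) ~= 0.0036011, and
+//   e^1 ~= 2^1 * 2^(28/64) * e^r ~= 2 * 1.354256 * 1.003608 ~= 2.71828.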
+
+__attribute__((overloadable, always_inline, weak)) double
+#if defined COMPILING_EXP2
+exp2(double x)
+#elif defined COMPILING_EXP10
+exp10(double x)
+#else
+exp(double x)
+#endif
+{
+    USE_TABLE(double2, p_tbl, TWO_TO_JBY64_EP);
+
+#if defined(COMPILING_EXP2)
+    const double X_MAX = 1024.0;
+    const double X_MIN = -1074.0;
+#elif defined(COMPILING_EXP10)
+    const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10)
+    const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10)
+#else
+    const double X_MAX = 0x1.62e42fefa39efp+9; // 1024*ln(2)
+    const double X_MIN = -0x1.74910d52d3051p+9; // -1075*ln(2)
+#endif
+
+#if defined(COMPILING_EXP2)
+    const double R_64 = 64.0;
+    const double R_1_BY_64 = 1.0 / 64.0;
+    const double R_LN2 = 0x1.62e42fefa39efp-1; // ln(2)
+#elif defined(COMPILING_EXP10)
+    const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2)
+    const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10))
+    const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10))
+    const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10)
+#else
+    const double R_64_BY_LOG2 = 0x1.71547652b82fep+6; // 64/ln(2)
+    const double R_LOG2_BY_64_LD = 0x1.62e42fefa0000p-7; // head ln(2)/64
+    const double R_LOG2_BY_64_TL = 0x1.cf79abc9e3b39p-46; // tail ln(2)/64
+#endif
+
+#if defined(COMPILING_EXP2)
+    int n = convert_int(x * R_64);
+#elif defined(COMPILING_EXP10)
+    int n = convert_int(x * R_64_BY_LOG10_2);
+#else
+    int n = convert_int(x * R_64_BY_LOG2);
+#endif
+
+    double dn = (double)n;
+
+    int j = n & 0x3f;
+    int m = n >> 6;
+
+#if defined(COMPILING_EXP2)
+    double r = R_LN2 * fma(-R_1_BY_64, dn, x);
+#elif defined(COMPILING_EXP10)
+    double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
+#else
+    double r = fma(-R_LOG2_BY_64_TL, dn, fma(-R_LOG2_BY_64_LD, dn, x));
+#endif
+
+    // 6 term tail of Taylor expansion of e^r
+    double z2 = r * fma(r,
+	                fma(r,
+		            fma(r,
+			        fma(r,
+			            fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
+			            0x1.5555555555555p-5),
+			        0x1.5555555555555p-3),
+		            0x1.0000000000000p-1),
+		        1.0);
+
+    double2 tv = p_tbl[j];
+    z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
+
+    int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
+
+    int n1 = m >> 2;
+    int n2 = m - n1;
+    double z3 = z2 * as_double(((long)n1 + 1023) << 52);
+    z3 *= as_double(((long)n2 + 1023) << 52);
+
+    z2 = ldexp(z2, m);
+    z2 = small_value ? z3 : z2;
+
+    z2 = isnan(x) ? x : z2;
+
+    z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
+    z2 = x < X_MIN ? 0.0 : z2;
+
+    return z2;
+}
+
diff --git a/amd-builtins/math64/expD_table.h b/amd-builtins/math64/expD_table.h
new file mode 100644
index 0000000..9909153
--- /dev/null
+++ b/amd-builtins/math64/expD_table.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(double2, TWO_TO_JBY64_EP, 64,
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.02c9a30000000p+0, 0x1.cef00c1dcdef9p-25),
+    (double2)(0x1.059b0d0000000p+0, 0x1.8ac2ba1d73e2ap-27),
+    (double2)(0x1.0874510000000p+0, 0x1.0eb37901186bep-25),
+    (double2)(0x1.0b55860000000p+0, 0x1.9f3121ec53172p-25),
+    (double2)(0x1.0e3ec30000000p+0, 0x1.69e8d10103a17p-27),
+    (double2)(0x1.11301d0000000p+0, 0x1.25b50a4ebbf1ap-32),
+    (double2)(0x1.1429aa0000000p+0, 0x1.d525bbf668203p-25),
+    (double2)(0x1.172b830000000p+0, 0x1.8faa2f5b9bef9p-25),
+    (double2)(0x1.1a35be0000000p+0, 0x1.6df96ea796d31p-25),
+    (double2)(0x1.1d48730000000p+0, 0x1.68b9aa7805b80p-28),
+    (double2)(0x1.2063b80000000p+0, 0x1.0c519ac771dd6p-25),
+    (double2)(0x1.2387a60000000p+0, 0x1.ceac470cd83f5p-25),
+    (double2)(0x1.26b4560000000p+0, 0x1.789f37495e99cp-26),
+    (double2)(0x1.29e9df0000000p+0, 0x1.47f7b84b09745p-26),
+    (double2)(0x1.2d285a0000000p+0, 0x1.b900c2d002475p-26),
+    (double2)(0x1.306fe00000000p+0, 0x1.4636e2a5bd1abp-25),
+    (double2)(0x1.33c08b0000000p+0, 0x1.320b7fa64e430p-27),
+    (double2)(0x1.371a730000000p+0, 0x1.ceaa72a9c5154p-26),
+    (double2)(0x1.3a7db30000000p+0, 0x1.3967fdba86f24p-26),
+    (double2)(0x1.3dea640000000p+0, 0x1.82468446b6824p-25),
+    (double2)(0x1.4160a20000000p+0, 0x1.f72e29f84325bp-28),
+    (double2)(0x1.44e0860000000p+0, 0x1.8624b40c4dbd0p-30),
+    (double2)(0x1.486a2b0000000p+0, 0x1.704f3404f068ep-26),
+    (double2)(0x1.4bfdad0000000p+0, 0x1.4d8a89c750e5ep-26),
+    (double2)(0x1.4f9b270000000p+0, 0x1.a74b29ab4cf62p-26),
+    (double2)(0x1.5342b50000000p+0, 0x1.a753e077c2a0fp-26),
+    (double2)(0x1.56f4730000000p+0, 0x1.ad49f699bb2c0p-26),
+    (double2)(0x1.5ab07d0000000p+0, 0x1.a90a852b19260p-25),
+    (double2)(0x1.5e76f10000000p+0, 0x1.6b48521ba6f93p-26),
+    (double2)(0x1.6247eb0000000p+0, 0x1.d2ac258f87d03p-31),
+    (double2)(0x1.6623880000000p+0, 0x1.2a91124893ecfp-27),
+    (double2)(0x1.6a09e60000000p+0, 0x1.9fcef32422cbep-26),
+    (double2)(0x1.6dfb230000000p+0, 0x1.8ca345de441c5p-25),
+    (double2)(0x1.71f75e0000000p+0, 0x1.1d8bee7ba46e1p-25),
+    (double2)(0x1.75feb50000000p+0, 0x1.9099f22fdba6ap-26),
+    (double2)(0x1.7a11470000000p+0, 0x1.f580c36bea881p-27),
+    (double2)(0x1.7e2f330000000p+0, 0x1.b3d398841740ap-26),
+    (double2)(0x1.8258990000000p+0, 0x1.2999c25159f11p-25),
+    (double2)(0x1.868d990000000p+0, 0x1.68925d901c83bp-25),
+    (double2)(0x1.8ace540000000p+0, 0x1.15506dadd3e2ap-27),
+    (double2)(0x1.8f1ae90000000p+0, 0x1.22aee6c57304ep-25),
+    (double2)(0x1.93737b0000000p+0, 0x1.9b8bc9e8a0387p-29),
+    (double2)(0x1.97d8290000000p+0, 0x1.fbc9c9f173d24p-25),
+    (double2)(0x1.9c49180000000p+0, 0x1.51f8480e3e235p-27),
+    (double2)(0x1.a0c6670000000p+0, 0x1.6bbcac96535b5p-25),
+    (double2)(0x1.a5503b0000000p+0, 0x1.1f12ae45a1224p-27),
+    (double2)(0x1.a9e6b50000000p+0, 0x1.5e7f6fd0fac90p-26),
+    (double2)(0x1.ae89f90000000p+0, 0x1.2b5a75abd0e69p-25),
+    (double2)(0x1.b33a2b0000000p+0, 0x1.09e2bf5ed7fa1p-25),
+    (double2)(0x1.b7f76f0000000p+0, 0x1.7daf237553d84p-27),
+    (double2)(0x1.bcc1e90000000p+0, 0x1.2f074891ee83dp-30),
+    (double2)(0x1.c199bd0000000p+0, 0x1.b0aa538444196p-25),
+    (double2)(0x1.c67f120000000p+0, 0x1.cafa29694426fp-25),
+    (double2)(0x1.cb720d0000000p+0, 0x1.9df20d22a0797p-25),
+    (double2)(0x1.d072d40000000p+0, 0x1.40f12f71a1e45p-25),
+    (double2)(0x1.d5818d0000000p+0, 0x1.9f7490e4bb40bp-25),
+    (double2)(0x1.da9e600000000p+0, 0x1.ed9942b84600dp-27),
+    (double2)(0x1.dfc9730000000p+0, 0x1.bdcdaf5cb4656p-27),
+    (double2)(0x1.e502ee0000000p+0, 0x1.e2cffd89cf44cp-26),
+    (double2)(0x1.ea4afa0000000p+0, 0x1.52486cc2c7b9dp-27),
+    (double2)(0x1.efa1be0000000p+0, 0x1.cc2b44eee3fa4p-25),
+    (double2)(0x1.f507650000000p+0, 0x1.6dc8a80ce9f09p-25),
+    (double2)(0x1.fa7c180000000p+0, 0x1.9e90d82e90a7ep-28),
+)
+
diff --git a/amd-builtins/math64/expm1D.cl b/amd-builtins/math64/expm1D.cl
new file mode 100644
index 0000000..a61384d
--- /dev/null
+++ b/amd-builtins/math64/expm1D.cl
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "math64.h"
+
+__attribute__((overloadable)) double
+expm1(double x)
+{
+    USE_TABLE(double2, p_tbl, TWO_TO_JBY64_EP);
+
+    const double max_expm1_arg = 709.8;
+    const double min_expm1_arg = -37.42994775023704;
+    const double log_OnePlus_OneByFour = 0.22314355131420976;   //0x3FCC8FF7C79A9A22 = log(1+1/4)
+    const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4)
+    const double sixtyfour_by_lnof2 = 92.33248261689366;        //0x40571547652b82fe
+    const double lnof2_by_64_head = 0.010830424696223417;       //0x3f862e42fefa0000
+    const double lnof2_by_64_tail = 2.5728046223276688e-14;     //0x3d1cf79abc9e3b39
+
+    // First, assume log(1-1/4) < x < log(1+1/4) i.e  -0.28768 < x < 0.22314
+    double u = as_double(as_ulong(x) & 0xffffffffff000000UL);
+    double v = x - u;
+    double y = u * u * 0.5;
+    double z = v * (x + u) * 0.5;
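+    // u is x with the low 24 bits of its significand cleared, so
+    // y = u*u/2 is nearly exact and z = v*(x + u)/2 supplies the
+    // remaining cross terms of x*x/2.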
+
+    double q = fma(x,
+	           fma(x,
+		       fma(x,
+			   fma(x,
+			       fma(x,
+				   fma(x,
+				       fma(x,
+					   fma(x,2.4360682937111612e-8, 2.7582184028154370e-7),
+					   2.7558212415361945e-6),
+				       2.4801576918453420e-5),
+				   1.9841269447671544e-4),
+			       1.3888888890687830e-3),
+			   8.3333333334012270e-3),
+		       4.1666666666665560e-2),
+		   1.6666666666666632e-1);
+    q *= x * x * x;
+
+    double z1g = (u + y) + (q + (v + z));
+    double z1 = x + (y + (q + z));
+    z1 = y >= 0x1.0p-7 ? z1g : z1;
+
+    // Now assume outside interval around 0
+    int n = (int)(x * sixtyfour_by_lnof2);
+    int j = n & 0x3f;
+    int m = n >> 6;
+
+    double2 tv = p_tbl[j];
+    double f1 = tv.s0;
+    double f2 = tv.s1;
+    double f = f1 + f2;
+
+    double dn = -n;
+    double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x));
+
+    q = fma(r,
+	    fma(r,
+		fma(r,
+		    fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+		    4.16666666662260795726e-02),
+		1.66666666665260878863e-01),
+	     5.00000000000000008883e-01);
+    q = fma(r*r, q, r);
+
+    double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64);
+    double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64);
+
+    // Computations for m > 52, including where result is close to Inf
+    ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2))));
+    int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1;
+
+    double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64));
+    zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024;
+
+    double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm));
+    zmg52 = m == 1024 ? zme1024 : zmg52;
+
+    // For m < 53
+    double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q)));
+
+    // For m < -7
+    double zmln7 = fma(twopm,  f1 + fma(f, q, f2), -1.0);
+
+    z = m < 53 ? zml53 : zmg52;
+    z = m < -7 ? zmln7 : z;
+    z = (x > log_OneMinus_OneByFour) & (x < log_OnePlus_OneByFour) ? z1 : z;
+    z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z;
+    z = x < min_expm1_arg ? -1.0 : z;
+
+    return z;
+}
+
diff --git a/amd-builtins/math64/fabsD.cl b/amd-builtins/math64/fabsD.cl
new file mode 100644
index 0000000..eded5e3
--- /dev/null
+++ b/amd-builtins/math64/fabsD.cl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+#define G(N) \
+__attribute__((overloadable, always_inline)) double##N \
+fabs(double##N x) \
+{ \
+    double##N ret; \
+    ret.lo = fabs(x.lo); \
+    ret.hi = fabs(x.hi); \
+    return ret; \
+}
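+
+// Each G(N) overload splits the vector into .lo/.hi halves and recurses
+// down to the scalar case. double3 is handled separately below,
+// presumably because a 3-vector's .hi spans an undefined fourth
+// component rather than exactly half the elements.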
+
+G(16)
+G(8)
+G(4)
+
+__attribute__((overloadable, always_inline)) double3
+fabs(double3 x)
+{
+    double3 ret;
+    ret.s01 = fabs(x.s01);
+    ret.s2 = fabs(x.s2);
+    return ret;
+}
+
+G(2)
+
+__attribute__((overloadable, always_inline)) double
+fabs(double x)
+{
+    return __amdil_fabs_f64(x);
+}
+
diff --git a/amd-builtins/math64/fdimD.cl b/amd-builtins/math64/fdimD.cl
new file mode 100644
index 0000000..194e99c
--- /dev/null
+++ b/amd-builtins/math64/fdimD.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+fdim(double x, double y)
+{
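+    // Branchless select: scalar relational ops and isnan() return 0 or 1,
+    // so -(cond) forms an all-zeros or all-ones mask. n is the quiet-NaN
+    // pattern when either input is NaN; r is x - y only when x > y. The
+    // two masks are never both nonzero, so n | r yields the result.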
+    long n = -(isnan(x) | isnan(y)) & QNANBITPATT_DP64;
+    long r = -(x > y) & as_long(x - y);
+    return as_double(n | r);
+}
+
diff --git a/amd-builtins/math64/floorD.cl b/amd-builtins/math64/floorD.cl
new file mode 100644
index 0000000..2ac88fd
--- /dev/null
+++ b/amd-builtins/math64/floorD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__ ((overloadable, always_inline)) double
+floor(double x)
+{
+    return __amdil_round_neginf_f64(x);
+}
+
diff --git a/amd-builtins/math64/fmaD.cl b/amd-builtins/math64/fmaD.cl
new file mode 100644
index 0000000..8d7b8c8
--- /dev/null
+++ b/amd-builtins/math64/fmaD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+fma(double x, double y, double z)
+{
+    return __amdil_fma_f64(x, y, z);
+}
+
diff --git a/amd-builtins/math64/fmaxD.cl b/amd-builtins/math64/fmaxD.cl
new file mode 100644
index 0000000..682317a
--- /dev/null
+++ b/amd-builtins/math64/fmaxD.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+extern __attribute__((pure)) double __hsail_max_f64(double,double);
+
+__attribute__((overloadable, always_inline)) double
+fmax(double x, double y)
+{
+    return __hsail_max_f64(x, y);
+}
+
diff --git a/amd-builtins/math64/fminD.cl b/amd-builtins/math64/fminD.cl
new file mode 100644
index 0000000..30f13bb
--- /dev/null
+++ b/amd-builtins/math64/fminD.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+extern __attribute__((pure)) double __hsail_min_f64(double,double);
+
+__attribute__((overloadable, always_inline)) double
+fmin(double x, double y)
+{
+    return __hsail_min_f64(x, y);
+}
+
diff --git a/amd-builtins/math64/fmodD.cl b/amd-builtins/math64/fmodD.cl
new file mode 100644
index 0000000..8906e14
--- /dev/null
+++ b/amd-builtins/math64/fmodD.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+#define COMPILING_FMOD
+#include "remainderD.h"
+
diff --git a/amd-builtins/math64/fractD.cl b/amd-builtins/math64/fractD.cl
new file mode 100644
index 0000000..2d8ea83
--- /dev/null
+++ b/amd-builtins/math64/fractD.cl
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+fract(double x, double *ip)
+{
+    long j = as_long(x);
+    long z = j & 0x8000000000000000L;
+    long a = j ^ z;
+    long n = a == 0x7ff0000000000000L ? z : j;
+    long s = a != 0L & z != 0L ? 0xbff0000000000000L : z;
+    int e = ((int)(j >> 52) & 0x7ff) - 1023;
+    long m = 0x000fffffffffffffL >> e;
+    long k = 0x0010000000000000L >> e;
+    k = (j & m) != 0L & z != 0L ? k : 0L;
+    k += j;
+    k &= ~m;
+    k = e < 0 ? s : k;
+    k = e > 51 ? j : k;
+    double i = as_double(k);
+    long d = as_long(x - i);
+    d -= d == 0x3ff0000000000000L;
+    d = a ? d : z;
+    d = e == 1024 ? n : d;
+    *ip = i;
+    return as_double(d);
+}
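+
+// Usage sketch (illustrative): per the OpenCL spec, fract stores floor(x)
+// through the pointer and returns fmin(x - floor(x), nextafter(1.0, 0.0));
+// the "d -= d == 0x3ff0000000000000L" line above supplies that clamp.
+//
+//   double ip;
+//   double f = fract(2.75, &ip);   // f == 0.75, ip == 2.0
+//   f = fract(-0.25, &ip);         // f == 0.75, ip == -1.0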
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline)) double
+fract(double x, __local double *ip)
+{
+    double i;
+    double f = fract(x, &i);
+    *ip = i;
+    return f;
+}
+
+__attribute__((overloadable, always_inline)) double
+fract(double x, __global double *ip)
+{
+    double i;
+    double f = fract(x, &i);
+    *ip = i;
+    return f;
+}
+#endif
diff --git a/amd-builtins/math64/frexpD.cl b/amd-builtins/math64/frexpD.cl
new file mode 100644
index 0000000..280a1c4
--- /dev/null
+++ b/amd-builtins/math64/frexpD.cl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline, weak)) double
+frexp(double x, int *ep)
+{
+    long i = as_long(x);
+    long ai = i & 0x7fffffffffffffffL;
+    int d = ai > 0 & ai < 0x0010000000000000L;
+    // scale subnormal by 2^54 without multiplying
+    double s = as_double(ai | 0x0370000000000000L) - 0x1.0p-968;
+    ai = d ? as_long(s) : ai;
+    int e = (int)(ai >> 52) - 1022 - (d ? 54 : 0);
+    int t = ai == 0 | e == 1025;
+    i = (i & 0x8000000000000000L) | 0x3fe0000000000000L | (ai & 0x000fffffffffffffL);
+    *ep = t ? 0 : e;
+    return t ? x : as_double(i);
+}
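+
+// Usage sketch (illustrative): frexp splits finite non-zero x into
+// m * 2^e with 0.5 <= |m| < 1:
+//
+//   int e;
+//   double m = frexp(8.0, &e);   // m == 0.5,   e == 4
+//   m = frexp(-3.0, &e);         // m == -0.75, e == 2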
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double
+frexp(double x, __local int *ep)
+{
+    int e;
+    double f = frexp(x, &e);
+    *ep = e;
+    return f;
+}
+
+__attribute__((overloadable, always_inline, weak)) double
+frexp(double x, __global int *ep)
+{
+    int e;
+    double f = frexp(x, &e);
+    *ep = e;
+    return f;
+}
+#endif
diff --git a/amd-builtins/math64/hypotD.cl b/amd-builtins/math64/hypotD.cl
new file mode 100644
index 0000000..87c2df6
--- /dev/null
+++ b/amd-builtins/math64/hypotD.cl
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+hypot(double x, double y)
+{
+    ulong ux = as_ulong(x) & ~SIGNBIT_DP64;
+    int xexp = ux >> EXPSHIFTBITS_DP64;
+    x = as_double(ux);
+
+    ulong uy = as_ulong(y) & ~SIGNBIT_DP64;
+    int yexp = uy >> EXPSHIFTBITS_DP64;
+    y = as_double(uy);
+
+    int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500;
+    double preadjust = c ? 0x1.0p-600 : 1.0;
+    double postadjust = c ? 0x1.0p+600 : 1.0;
+
+    c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500;
+    preadjust = c ? 0x1.0p+600 : preadjust;
+    postadjust = c ? 0x1.0p-600 : postadjust;
+
+    double ax = x * preadjust;
+    double ay = y * preadjust;
+
+    // The post adjust may overflow, but this can't be avoided in any case
+    double r = sqrt(fma(ax, ax, ay*ay)) * postadjust;
+
+    // If the difference in exponents between x and y is large
+    double s = x + y;
+    c = abs(xexp - yexp) > MANTLENGTH_DP64 + 1;
+    r = c ? s : r;
+
+    // Check for NaN
+    c = isnan(x) | isnan(y);
+    r = c ? as_double(QNANBITPATT_DP64) : r;
+
+    // If either is Inf, we must return Inf
+    c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64);
+    r = c ? as_double(PINFBITPATT_DP64) : r;
+
+    return r;
+}
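+
+// Scaling sketch (illustrative): for inputs near 1.0e+300, ax*ax alone would
+// overflow, so both operands are pre-scaled by 2^-600 and the result is
+// rescaled by 2^+600 afterwards:
+//
+//   hypot(3.0e+300, 4.0e+300)   // ~= 5.0e+300, no intermediate overflow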
+
diff --git a/amd-builtins/math64/ilogbD.cl b/amd-builtins/math64/ilogbD.cl
new file mode 100644
index 0000000..3a66936
--- /dev/null
+++ b/amd-builtins/math64/ilogbD.cl
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) int
+ilogb(double x)
+{
+    ulong ux = as_ulong(x);
+    ulong ax = ux & ~SIGNBIT_DP64;
+    int r = (int)(ax >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+    int rs = -1011 - (int)clz(ax & MANTBITS_DP64);
+    r = ax < 0x0010000000000000UL ? rs : r;
+    r = ax > 0x7ff0000000000000UL | ax == 0UL ? 0x80000000 : r;
+    r = ax == 0x7ff0000000000000UL ? 0x7fffffff : r;
+    return r;
+}
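+
+// Edge-case sketch (illustrative):
+//   ilogb(1.0) == 0,  ilogb(0.5) == -1,  ilogb(INFINITY) == INT_MAX
+//   ilogb(0.0) and ilogb(NAN) both yield 0x80000000 (INT_MIN) here, a
+//   value OpenCL permits for FP_ILOGB0 / FP_ILOGBNAN.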
+
diff --git a/amd-builtins/math64/ldexpD.cl b/amd-builtins/math64/ldexpD.cl
new file mode 100644
index 0000000..58c549d
--- /dev/null
+++ b/amd-builtins/math64/ldexpD.cl
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline, weak)) double
+ldexp(double x, int n)
+{
+	long l = as_ulong(x);
+	int e = (l >> 52) & 0x7ff;
+	long s = l & 0x8000000000000000;
+
+	ulong ux = as_ulong(x * 0x1.0p+53);
+	int de = ((int)(ux >> 52) & 0x7ff) - 53;
+	int c = e == 0;
+	e = c ? de : e;
+
+	ux = c ? ux : l;
+
+	int v = e + n;
+	v = clamp(v, -0x7ff, 0x7ff);
+
+	ux &= ~EXPBITS_DP64;
+
+	double mr = as_double(ux | ((ulong)(v+53) << 52));
+	mr = mr * 0x1.0p-53;
+
+	mr = v > 0  ? as_double(ux | ((ulong)v << 52)) : mr;
+
+	mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64)  : mr;
+	mr = v < -53 ? as_double(s) : mr;
+
+	mr = (n == 0 | isinf(x) | x == 0) ? x : mr;
+	return mr;
+}
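+
+// Usage sketch (illustrative): ldexp scales by a power of two exactly,
+// e.g. ldexp(0.75, 3) == 6.0. Per the clamping above, results beyond the
+// exponent range return signed infinity, and deep underflow (v < -53)
+// flushes to signed zero.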
diff --git a/amd-builtins/math64/lgammaD.cl b/amd-builtins/math64/lgammaD.cl
new file mode 100644
index 0000000..eb5119a
--- /dev/null
+++ b/amd-builtins/math64/lgammaD.cl
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunPro, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+// lgamma_r(x, i)
+// Reentrant version of the logarithm of the Gamma function
+// with a user-provided pointer for the sign of Gamma(x).
+//
+// Method:
+//   1. Argument Reduction for 0 < x <= 8
+//      Since gamma(1+s)=s*gamma(s), for x in [0,8], we may
+//      reduce x to a number in [1.5,2.5] by
+//              lgamma(1+s) = log(s) + lgamma(s)
+//      for example,
+//              lgamma(7.3) = log(6.3) + lgamma(6.3)
+//                          = log(6.3*5.3) + lgamma(5.3)
+//                          = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3)
+//   2. Polynomial approximation of lgamma around its
+//      minimum ymin=1.461632144968362245 to maintain monotonicity.
+//      On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use
+//              Let z = x-ymin;
+//              lgamma(x) = -1.214862905358496078218 + z^2*poly(z)
+//      where
+//              poly(z) is a 14 degree polynomial.
+//   3. Rational approximation in the primary interval [2,3]
+//      We use the following approximation:
+//              s = x-2.0;
+//              lgamma(x) = 0.5*s + s*P(s)/Q(s)
+//      with accuracy
+//              |P/Q - (lgamma(x)-0.5s)| < 2**-61.71
+//      Our algorithms are based on the following observation
+//
+//                             zeta(2)-1    2    zeta(3)-1    3
+// lgamma(2+s) = s*(1-Euler) + --------- * s  -  --------- * s  + ...
+//                                 2                 3
+//
+//      where Euler = 0.5772... is the Euler constant, which is very
+//      close to 0.5.
+//
+//   4. For x>=8, we have
+//      lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+....
+//      (better formula:
+//         lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...)
+//      Let z = 1/x; then we approximate
+//              f(z) = lgamma(x) - (x-0.5)(log(x)-1)
+//      by
+//                                  3       5             11
+//              w = w0 + w1*z + w2*z  + w3*z  + ... + w6*z
+//      where
+//              |w - f(z)| < 2**-58.74
+//
+//   5. For negative x, since (G is the gamma function)
+//              -x*G(-x)*G(x) = pi/sin(pi*x),
+//      we have
+//              G(x) = pi/(sin(pi*x)*(-x)*G(-x))
+//      since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0
+//      Hence, for x<0, signgam = sign(sin(pi*x)) and
+//              lgamma(x) = log(|Gamma(x)|)
+//                        = log(pi/(|x*sin(pi*x)|)) - lgamma(-x);
+//      Note: one should avoid computing pi*(-x) directly in the
+//            computation of sin(pi*(-x)).
+//
+//   6. Special Cases
+//              lgamma(2+s) ~ s*(1-Euler) for tiny s
+//              lgamma(1)=lgamma(2)=0
+//              lgamma(x) ~ -log(x) for tiny x
+//              lgamma(0) = lgamma(inf) = inf
+//              lgamma(-integer) = +-inf
+//
+
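+// Sanity checks for the method above (mathematical identities, offered as
+// illustration): lgamma(1.0) == lgamma(2.0) == 0.0, and
+// lgamma(0.5) == log(sqrt(pi)) ~= 0.5723649429.
+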
+#define pi 3.14159265358979311600e+00	/* 0x400921FB, 0x54442D18 */
+
+#define a0 7.72156649015328655494e-02	/* 0x3FB3C467, 0xE37DB0C8 */
+#define a1 3.22467033424113591611e-01	/* 0x3FD4A34C, 0xC4A60FAD */
+#define a2 6.73523010531292681824e-02	/* 0x3FB13E00, 0x1A5562A7 */
+#define a3 2.05808084325167332806e-02	/* 0x3F951322, 0xAC92547B */
+#define a4 7.38555086081402883957e-03	/* 0x3F7E404F, 0xB68FEFE8 */
+#define a5 2.89051383673415629091e-03	/* 0x3F67ADD8, 0xCCB7926B */
+#define a6 1.19270763183362067845e-03	/* 0x3F538A94, 0x116F3F5D */
+#define a7 5.10069792153511336608e-04	/* 0x3F40B6C6, 0x89B99C00 */
+#define a8 2.20862790713908385557e-04	/* 0x3F2CF2EC, 0xED10E54D */
+#define a9 1.08011567247583939954e-04	/* 0x3F1C5088, 0x987DFB07 */
+#define a10 2.52144565451257326939e-05	/* 0x3EFA7074, 0x428CFA52 */
+#define a11 4.48640949618915160150e-05	/* 0x3F07858E, 0x90A45837 */
+
+#define tc 1.46163214496836224576e+00	/* 0x3FF762D8, 0x6356BE3F */
+#define tf -1.21486290535849611461e-01	/* 0xBFBF19B9, 0xBCC38A42 */
+#define tt -3.63867699703950536541e-18	/* 0xBC50C7CA, 0xA48A971F */
+
+#define t0 4.83836122723810047042e-01	/* 0x3FDEF72B, 0xC8EE38A2 */
+#define t1 -1.47587722994593911752e-01	/* 0xBFC2E427, 0x8DC6C509 */
+#define t2 6.46249402391333854778e-02	/* 0x3FB08B42, 0x94D5419B */
+#define t3 -3.27885410759859649565e-02	/* 0xBFA0C9A8, 0xDF35B713 */
+#define t4 1.79706750811820387126e-02	/* 0x3F9266E7, 0x970AF9EC */
+#define t5 -1.03142241298341437450e-02	/* 0xBF851F9F, 0xBA91EC6A */
+#define t6 6.10053870246291332635e-03	/* 0x3F78FCE0, 0xE370E344 */
+#define t7 -3.68452016781138256760e-03	/* 0xBF6E2EFF, 0xB3E914D7 */
+#define t8 2.25964780900612472250e-03	/* 0x3F6282D3, 0x2E15C915 */
+#define t9 -1.40346469989232843813e-03	/* 0xBF56FE8E, 0xBF2D1AF1 */
+#define t10 8.81081882437654011382e-04	/* 0x3F4CDF0C, 0xEF61A8E9 */
+#define t11 -5.38595305356740546715e-04	/* 0xBF41A610, 0x9C73E0EC */
+#define t12 3.15632070903625950361e-04	/* 0x3F34AF6D, 0x6C0EBBF7 */
+#define t13 -3.12754168375120860518e-04	/* 0xBF347F24, 0xECC38C38 */
+#define t14 3.35529192635519073543e-04	/* 0x3F35FD3E, 0xE8C2D3F4 */
+
+#define u0 -7.72156649015328655494e-02	/* 0xBFB3C467, 0xE37DB0C8 */
+#define u1 6.32827064025093366517e-01	/* 0x3FE4401E, 0x8B005DFF */
+#define u2 1.45492250137234768737e+00	/* 0x3FF7475C, 0xD119BD6F */
+#define u3 9.77717527963372745603e-01	/* 0x3FEF4976, 0x44EA8450 */
+#define u4 2.28963728064692451092e-01	/* 0x3FCD4EAE, 0xF6010924 */
+#define u5 1.33810918536787660377e-02	/* 0x3F8B678B, 0xBF2BAB09 */
+
+#define v1 2.45597793713041134822e+00	/* 0x4003A5D7, 0xC2BD619C */
+#define v2 2.12848976379893395361e+00	/* 0x40010725, 0xA42B18F5 */
+#define v3 7.69285150456672783825e-01	/* 0x3FE89DFB, 0xE45050AF */
+#define v4 1.04222645593369134254e-01	/* 0x3FBAAE55, 0xD6537C88 */
+#define v5 3.21709242282423911810e-03	/* 0x3F6A5ABB, 0x57D0CF61 */
+
+#define s0 -7.72156649015328655494e-02	/* 0xBFB3C467, 0xE37DB0C8 */
+#define s1 2.14982415960608852501e-01	/* 0x3FCB848B, 0x36E20878 */
+#define s2 3.25778796408930981787e-01	/* 0x3FD4D98F, 0x4F139F59 */
+#define s3 1.46350472652464452805e-01	/* 0x3FC2BB9C, 0xBEE5F2F7 */
+#define s4 2.66422703033638609560e-02	/* 0x3F9B481C, 0x7E939961 */
+#define s5 1.84028451407337715652e-03	/* 0x3F5E26B6, 0x7368F239 */
+#define s6 3.19475326584100867617e-05	/* 0x3F00BFEC, 0xDD17E945 */
+
+#define r1 1.39200533467621045958e+00	/* 0x3FF645A7, 0x62C4AB74 */
+#define r2 7.21935547567138069525e-01	/* 0x3FE71A18, 0x93D3DCDC */
+#define r3 1.71933865632803078993e-01	/* 0x3FC601ED, 0xCCFBDF27 */
+#define r4 1.86459191715652901344e-02	/* 0x3F9317EA, 0x742ED475 */
+#define r5 7.77942496381893596434e-04	/* 0x3F497DDA, 0xCA41A95B */
+#define r6 7.32668430744625636189e-06	/* 0x3EDEBAF7, 0xA5B38140 */
+
+#define w0 4.18938533204672725052e-01	/* 0x3FDACFE3, 0x90C97D69 */
+#define w1 8.33333333333329678849e-02	/* 0x3FB55555, 0x5555553B */
+#define w2 -2.77777777728775536470e-03	/* 0xBF66C16C, 0x16B02E5C */
+#define w3 7.93650558643019558500e-04	/* 0x3F4A019F, 0x98CF38B6 */
+#define w4 -5.95187557450339963135e-04	/* 0xBF4380CB, 0x8C0FE741 */
+#define w5 8.36339918996282139126e-04	/* 0x3F4B67BA, 0x4CDAD5D1 */
+#define w6 -1.63092934096575273989e-03	/* 0xBF5AB89D, 0x0B9E43E4 */
+
+__attribute__ ((overloadable, always_inline)) double
+lgamma_r(double x, int *ip)
+{
+    ulong ux = as_ulong(x);
+    ulong ax = ux & EXSIGNBIT_DP64;
+    double absx = as_double(ax);
+
+    if (ax >= 0x7ff0000000000000UL) {
+        // +-Inf, NaN
+	*ip = 1;
+	return absx;
+    }
+
+    if (absx < 0x1.0p-70) {
+	*ip = ax == ux ? 1 : -1;
+	return -log(absx);
+    }
+
+    // Handle rest of range
+    double r;
+
+    if (absx < 2.0) {
+	int i = 0;
+	double y = 2.0 - absx;
+
+	int c = absx < 0x1.bb4c3p+0;
+	double t = absx - tc;
+	i = c ? 1 : i;
+	y = c ? t : y;
+
+	c = absx < 0x1.3b4c4p+0;
+	t = absx - 1.0;
+	i = c ? 2 : i;
+	y = c ? t : y;
+
+	c = absx <= 0x1.cccccp-1;
+	t = -log(absx);
+	r = c ? t : 0.0;
+	t = 1.0 - absx;
+	i = c ? 0 : i;
+	y = c ? t : y;
+
+	c = absx < 0x1.76944p-1;
+	t = absx - (tc - 1.0);
+	i = c ? 1 : i;
+	y = c ? t : y;
+
+	c = absx < 0x1.da661p-3;
+	i = c ? 2 : i;
+	y = c ? absx : y;
+
+	double p, q;
+
+	switch (i) {
+	case 0:
+	    p = fma(y, fma(y, fma(y, fma(y, a11, a10), a9), a8), a7);
+	    p = fma(y, fma(y, fma(y, fma(y, p, a6), a5), a4), a3);
+	    p = fma(y, fma(y, fma(y, p, a2), a1), a0);
+	    r = fma(y, p - 0.5, r);
+	    break;
+	case 1:
+	    p = fma(y, fma(y, fma(y, fma(y, t14, t13), t12), t11), t10);
+	    p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t9), t8), t7), t6), t5);
+	    p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t4), t3), t2), t1), t0);
+	    p = fma(y*y, p, -tt);
+	    r += (tf + p);
+	    break;
+	case 2:
+	    p = y*fma(y, fma(y, fma(y, fma(y, fma(y, u5, u4), u3), u2), u1), u0);
+	    q = fma(y, fma(y, fma(y, fma(y, fma(y, v5, v4), v3), v2), v1), 1.0);
+	    r += fma(-0.5, y, p/q);
+	}
+    } else if (absx < 8.0) {
+	int i = (int)absx;	// truncate: here 2.0 <= absx < 8.0
+	double y = absx - (double)i;
+	double p = y*fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, s6, s5), s4), s3), s2), s1), s0);
+	double q = fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, r6, r5), r4), r3), r2), r1), 1.0);
+	r = fma(0.5, y, p/q);
+	double z = 1.0;
+	// lgamma(1+s) = log(s) + lgamma(s)
+	double y6 = y + 6.0;
+	double y5 = y + 5.0;
+	double y4 = y + 4.0;
+	double y3 = y + 3.0;
+	double y2 = y + 2.0;
+	z *= i > 6 ? y6 : 1.0;
+	z *= i > 5 ? y5 : 1.0;
+	z *= i > 4 ? y4 : 1.0;
+	z *= i > 3 ? y3 : 1.0;
+	z *= i > 2 ? y2 : 1.0;
+        r += log(z);
+    } else {
+	double z = 1.0 / absx;
+	double z2 = z * z;
+	double w = fma(z, fma(z2, fma(z2, fma(z2, fma(z2, fma(z2, w6, w5), w4), w3), w2), w1), w0);
+	r = (absx - 0.5) * (log(absx) - 1.0) + w;
+    }
+
+    if (x < 0.0) {
+	double t = sinpi(x);
+	r = log(pi / fabs(t * x)) - r;
+	r = t == 0.0 ? as_double(PINFBITPATT_DP64) : r;
+	*ip = t < 0.0 ? -1 : 1;
+    } else
+	*ip = 1;
+
+    return r;
+}
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__ ((overloadable, always_inline)) double
+lgamma_r(double x, __local int *ip)
+{
+    int i;
+    double ret = lgamma_r(x, &i);
+    *ip = i;
+    return ret;
+}
+
+__attribute__ ((overloadable, always_inline)) double
+lgamma_r(double x, __global int *ip)
+{
+    int i;
+    double ret = lgamma_r(x, &i);
+    *ip = i;
+    return ret;
+}
+#endif
+
+__attribute__ ((overloadable, always_inline)) double
+lgamma(double x)
+{
+    int i;
+    return lgamma_r(x, &i);
+}
+
diff --git a/amd-builtins/math64/log10D.cl b/amd-builtins/math64/log10D.cl
new file mode 100644
index 0000000..df53cd8
--- /dev/null
+++ b/amd-builtins/math64/log10D.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_LOG10
+#include "logD_base.h"
+
diff --git a/amd-builtins/math64/log1pD.cl b/amd-builtins/math64/log1pD.cl
new file mode 100644
index 0000000..b1aaf5d
--- /dev/null
+++ b/amd-builtins/math64/log1pD.cl
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+log1p(double x)
+{
+    USE_TABLE(double2, p_tbl, LN_TBL);
+
+    // Computes natural log(1+x). Algorithm based on:
+    // Ping-Tak Peter Tang
+    // "Table-driven implementation of the logarithm function in IEEE
+    // floating-point arithmetic"
+    // ACM Transactions on Mathematical Software (TOMS)
+    // Volume 16, Issue 4 (December 1990)
+    // Note that we use a lookup table of size 64 rather than 128,
+    // and compensate by having extra terms in the minimax polynomial
+    // for the kernel approximation.
+
+    // Table-driven path, selected below when x is outside [exp(-1/16)-1, exp(1/16)-1]
+    ulong ux = as_ulong(1.0 + x);
+    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64;
+    double f = as_double(ONEEXPBITS_DP64 | (ux & MANTBITS_DP64));
+
+    int j = as_int2(ux).hi >> 13;
+    j = ((0x80 | (j & 0x7e)) >> 1) + (j & 0x1);
+    double f1 = (double)j * 0x1.0p-6;
+    j -= 64;
+
+    double f2temp = f - f1;
+    double m2 = as_double(convert_ulong(0x3ff - xexp) << EXPSHIFTBITS_DP64);
+    double f2l = fma(m2, x, m2 - f1);
+    double f2g = fma(m2, x, -f1) + m2;
+    double f2 = xexp <= MANTLENGTH_DP64-1 ? f2l : f2g;
+    f2 = (xexp <= -2) | (xexp >= MANTLENGTH_DP64+8) ? f2temp : f2;
+
+    double2 tv = p_tbl[j];
+    double z1 = tv.s0;
+    double q = tv.s1;
+
+    double u = MATH_DIVIDE(f2, fma(0.5, f2, f1));
+    double v = u * u;
+
+    double poly = v * fma(v,
+                          fma(v, 2.23219810758559851206e-03, 1.24999999978138668903e-02),
+                          8.33333333333333593622e-02);
+
+    // log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+
+    double z2 = q + fma(u, poly, u);
+    double dxexp = (double)xexp;
+    double r1 = fma(dxexp, log2_lead, z1);
+    double r2 = fma(dxexp, log2_tail, z2);
+    double result1 = r1 + r2;
+
+    // Series path, selected below when x is within [exp(-1/16)-1, exp(1/16)-1]
+    double r = x;
+    u = r / (2.0 + r);
+    double correction = r * u;
+    u = u + u;
+    v = u * u;
+    r1 = r;
+
+    poly = fma(v,
+               fma(v,
+                   fma(v, 4.34887777707614552256e-04, 2.23213998791944806202e-03),
+                   1.25000000037717509602e-02),
+               8.33333333333317923934e-02);
+
+    r2 = fma(u*v, poly, -correction);
+
+    // The values exp(-1/16)-1 and exp(1/16)-1
+    const double log1p_thresh1 = -0x1.f0540438fd5c3p-5;
+    const double log1p_thresh2 =  0x1.082b577d34ed8p-4;
+    double result2 = r1 + r2;
+    result2 = x < log1p_thresh1 | x > log1p_thresh2 ? result1 : result2;
+
+    result2 = isinf(x) ? x : result2;
+    result2 = x < -1.0 ? as_double(QNANBITPATT_DP64) : result2;
+    result2 = x == -1.0 ? as_double(NINFBITPATT_DP64) : result2;
+    return result2;
+}
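+
+// Usage sketch (illustrative): log1p keeps accuracy where log(1.0 + x)
+// would lose it to rounding:
+//
+//   log1p(1.0e-20)       // ~= 1.0e-20
+//   log(1.0 + 1.0e-20)   // == 0.0, because 1.0 + 1e-20 rounds to 1.0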
+
diff --git a/amd-builtins/math64/log2D.cl b/amd-builtins/math64/log2D.cl
new file mode 100644
index 0000000..86010f6
--- /dev/null
+++ b/amd-builtins/math64/log2D.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_LOG2
+#include "logD_base.h"
+
diff --git a/amd-builtins/math64/logD.cl b/amd-builtins/math64/logD.cl
new file mode 100644
index 0000000..0cd4763
--- /dev/null
+++ b/amd-builtins/math64/logD.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_LOG
+#include "logD_base.h"
+
diff --git a/amd-builtins/math64/logD_base.h b/amd-builtins/math64/logD_base.h
new file mode 100644
index 0000000..a07e902
--- /dev/null
+++ b/amd-builtins/math64/logD_base.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+//   Algorithm:
+//
+//   Based on:
+//   Ping-Tak Peter Tang
+//   "Table-driven implementation of the logarithm function in IEEE
+//   floating-point arithmetic"
+//   ACM Transactions on Mathematical Software (TOMS)
+//   Volume 16, Issue 4 (December 1990)
+//
+//
+//   x very close to 1.0 is handled differently, for x everywhere else
+//   a brief explanation is given below
+//
+//   x = (2^m)*A
+//   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+//   x = (2^m)*2*(G/2+g/2)
+//   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+//
+//   Y = (2^(-1))*(2^(-m))*(2^m)*A
+//   Now, range of Y is: 0.5 <= Y < 1
+//
+//   F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
+//   Now, range of F is: 128 <= F <= 256 
+//   F = F / 256 
+//   Now, range of F is: 0.5 <= F <= 1
+//
+//   f = -(Y-F), with (f <= 2^(-9))
+//
+//   log(x) = m*log(2) + log(2) + log(F-f)
+//   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
+//   log(x) = m*log(2) + log(2*F) + log(1-r)
+//
+//   r = (f/F), with (r <= 2^(-8))
+//   r = f*(1/F) with (1/F) precomputed to avoid division
+//
+//   log(x) = m*log(2) + log(G) - poly
+//
+//   log(G) is precomputed
+//   poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5
+//
+//   log(2) and log(G) need to be maintained in extra precision
+//   to avoid losing precision in the calculations
+//
+//
+//   For x close to 1.0, we employ the following technique to
+//   ensure faster convergence.
+//
+//   log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7 + ...
+//   x = ((1+s)/(1-s)) 
+//   x = 1 + r
+//   s = r/(2+r)
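+//
+//   Numeric check of the near-1 form (illustrative): for x = 1.25,
+//   r = 0.25 and s = r/(2+r) = 0.1111..., so
+//   2*s + (2/3)*s^3 ~= 0.2231367, already close to log(1.25) = 0.2231436;
+//   the s^5 and s^7 terms close the remaining gap.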
+
+__attribute__((overloadable)) double
+#if defined(COMPILING_LOG2)
+log2(double x)
+#elif defined(COMPILING_LOG10)
+log10(double x)
+#else
+log(double x)
+#endif
+{
+    USE_TABLE(double2, p_tbl, LN_TBL);
+
+#ifndef COMPILING_LOG2
+    // log2_lead and log2_tail sum to an extra-precise version of ln(2)
+    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+#endif
+
+#if defined(COMPILING_LOG10)
+    // log10e_lead and log10e_tail sum to an extra-precision version of log10(e) (19 bits in lead)
+    const double log10e_lead = 4.34293746948242187500e-01;  /* 0x3fdbcb7800000000 */
+    const double log10e_tail = 7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
+#elif defined(COMPILING_LOG2)
+    // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19 bits in lead)
+    const double log2e_lead = 1.44269180297851562500E+00; /* 0x3FF7154400000000 */
+    const double log2e_tail = 3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
+#endif
+
+    // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
+    // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 
+    const double log_thresh1 = 0x1.e0faap-1;
+    const double log_thresh2 = 0x1.1082cp+0;
+
+    int is_near = x >= log_thresh1 & x <= log_thresh2;
+
+    // Near 1 code
+    double r = x - 1.0;
+    double u = r / (2.0 + r);
+    double correction = r * u;
+    u = u + u;
+    double v = u * u;
+    double r1 = r;
+
+    const double ca_1 = 8.33333333333317923934e-02; /* 0x3fb55555555554e6 */
+    const double ca_2 = 1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */
+    const double ca_3 = 2.23213998791944806202e-03; /* 0x3f62492307f1519f */
+    const double ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
+
+    double r2 = fma(u*v, fma(v, fma(v, fma(v, ca_4, ca_3), ca_2), ca_1), -correction);
+
+#if defined(COMPILING_LOG10)
+    r = r1;
+    r1 = as_double(as_ulong(r1) & 0xffffffff00000000);
+    r2 = r2 + (r - r1);
+    double ret_near = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail * r2)));
+#elif defined(COMPILING_LOG2)
+    r = r1;
+    r1 = as_double(as_ulong(r1) & 0xffffffff00000000);
+    r2 = r2 + (r - r1);
+    double ret_near = fma(log2e_lead, r1, fma(log2e_lead, r2, fma(log2e_tail, r1, log2e_tail*r2)));
+#else
+    double ret_near = r1 + r2;
+#endif
+
+    // This is the far from 1 code
+
+    // Deal with subnormal
+    ulong ux = as_ulong(x);
+    ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
+    int c = ux < IMPBIT_DP64;
+    ux = c ? uxs : ux;
+    int expadjust = c ? 60 : 0;
+
+    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
+    double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
+    int index = as_int2(ux).hi >> 13;
+    index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);
+
+    double2 tv = p_tbl[index - 64];
+    double z1 = tv.s0;
+    double q = tv.s1;
+
+    double f1 = index * 0x1.0p-7;
+    double f2 = f - f1;
+    u = f2 / fma(f2, 0.5, f1);
+    v = u * u;
+
+    const double cb_1 = 8.33333333333333593622e-02; /* 0x3fb5555555555557 */
+    const double cb_2 = 1.24999999978138668903e-02; /* 0x3f89999999865ede */
+    const double cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
+
+    double poly = v * fma(v, fma(v, cb_3, cb_2), cb_1);
+    double z2 = q + fma(u, poly, u);
+
+    double dxexp = (double)xexp;
+#if defined (COMPILING_LOG10)
+    // Add xexp * log(2) to z1,z2 to get log(x)
+    r1 = fma(dxexp, log2_lead, z1);
+    r2 = fma(dxexp, log2_tail, z2);
+    double ret_far = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail*r2)));
+#elif defined(COMPILING_LOG2)
+    r1 = fma(log2e_lead, z1, dxexp);
+    r2 = fma(log2e_lead, z2, fma(log2e_tail, z1, log2e_tail*z2));
+    double ret_far = r1 + r2;
+#else
+    r1 = fma(dxexp, log2_lead, z1);
+    r2 = fma(dxexp, log2_tail, z2);
+    double ret_far = r1 + r2;
+#endif
+
+    double ret = is_near ? ret_near : ret_far;
+
+    ret = isinf(x) ? as_double(PINFBITPATT_DP64) : ret;
+    ret = isnan(x) | x < 0.0 ? as_double(QNANBITPATT_DP64) : ret;
+    ret = x == 0.0 ? as_double(NINFBITPATT_DP64) : ret;
+    return ret;
+}
+
diff --git a/amd-builtins/math64/logD_table.h b/amd-builtins/math64/logD_table.h
new file mode 100644
index 0000000..a7ccc20
--- /dev/null
+++ b/amd-builtins/math64/logD_table.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(double2, LN_TBL, 65,
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.fc0a800000000p-7, 0x1.61f807c79f3dbp-28),
+    (double2)(0x1.f829800000000p-6, 0x1.873c1980267c8p-25),
+    (double2)(0x1.7745800000000p-5, 0x1.ec65b9f88c69ep-26),
+    (double2)(0x1.f0a3000000000p-5, 0x1.8022c54cc2f99p-26),
+    (double2)(0x1.341d700000000p-4, 0x1.2c37a3a125330p-25),
+    (double2)(0x1.6f0d200000000p-4, 0x1.15cad69737c93p-25),
+    (double2)(0x1.a926d00000000p-4, 0x1.d256ab1b285e9p-27),
+    (double2)(0x1.e270700000000p-4, 0x1.b8abcb97a7aa2p-26),
+    (double2)(0x1.0d77e00000000p-3, 0x1.f34239659a5dcp-25),
+    (double2)(0x1.2955280000000p-3, 0x1.e07fd48d30177p-25),
+    (double2)(0x1.44d2b00000000p-3, 0x1.b32df4799f4f6p-25),
+    (double2)(0x1.5ff3000000000p-3, 0x1.c29e4f4f21cf8p-25),
+    (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30),
+    (double2)(0x1.9525a80000000p-3, 0x1.cf456b4764130p-27),
+    (double2)(0x1.af3c900000000p-3, 0x1.3a02ffcb63398p-25),
+    (double2)(0x1.c8ff780000000p-3, 0x1.1e6a6886b0976p-25),
+    (double2)(0x1.e270700000000p-3, 0x1.b8abcb97a7aa2p-25),
+    (double2)(0x1.fb91800000000p-3, 0x1.b578f8aa35552p-25),
+    (double2)(0x1.0a324c0000000p-2, 0x1.139c871afb9fcp-25),
+    (double2)(0x1.1675c80000000p-2, 0x1.5d5d30701ce64p-25),
+    (double2)(0x1.22941c0000000p-2, 0x1.de7bcb2d12142p-25),
+    (double2)(0x1.2e8e280000000p-2, 0x1.d708e984e1664p-25),
+    (double2)(0x1.3a64c40000000p-2, 0x1.56945e9c72f36p-26),
+    (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bdap-29),
+    (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f6p-28),
+    (double2)(0x1.5d1bd80000000p-2, 0x1.fac04e52846c7p-25),
+    (double2)(0x1.686c800000000p-2, 0x1.e9b14aec442bep-26),
+    (double2)(0x1.739d7c0000000p-2, 0x1.b5de8034e7126p-25),
+    (double2)(0x1.7eaf800000000p-2, 0x1.dc157e1b259d3p-25),
+    (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28),
+    (double2)(0x1.9479400000000p-2, 0x1.c2116faba4cddp-26),
+    (double2)(0x1.9f323c0000000p-2, 0x1.65fcc25f95b47p-25),
+    (double2)(0x1.a9cec80000000p-2, 0x1.a9a08498d4850p-26),
+    (double2)(0x1.b44f740000000p-2, 0x1.de647b1465f77p-25),
+    (double2)(0x1.beb4d80000000p-2, 0x1.da71b7bf7861dp-26),
+    (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28),
+    (double2)(0x1.d32fe40000000p-2, 0x1.f0075eab0ef64p-25),
+    (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989bp-28),
+    (double2)(0x1.e744240000000p-2, 0x1.0eb43c3f1bed2p-25),
+    (double2)(0x1.f128f40000000p-2, 0x1.faf06ecb35c84p-26),
+    (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f68p-27),
+    (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27),
+    (double2)(0x1.0723e40000000p-1, 0x1.c1cdf404e5796p-25),
+    (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27),
+    (double2)(0x1.109f380000000p-1, 0x1.e2d4c96fde3ecp-25),
+    (double2)(0x1.154c3c0000000p-1, 0x1.2f4d5e9a98f34p-25),
+    (double2)(0x1.19ee6a0000000p-1, 0x1.467c96ecc5cbep-25),
+    (double2)(0x1.1e85f40000000p-1, 0x1.e7040d03dec5ap-25),
+    (double2)(0x1.23130c0000000p-1, 0x1.7bebf4282de36p-25),
+    (double2)(0x1.2795e00000000p-1, 0x1.289b11aeb783fp-25),
+    (double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26),
+    (double2)(0x1.307d720000000p-1, 0x1.34f10be1fb591p-25),
+    (double2)(0x1.34e2880000000p-1, 0x1.d9ce1d316eb93p-25),
+    (double2)(0x1.393e0c0000000p-1, 0x1.3562a19a9c442p-25),
+    (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26),
+    (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c97ap-26),
+    (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bdap-28),
+    (double2)(0x1.4a4f840000000p-1, 0x1.db03ebb0227bfp-25),
+    (double2)(0x1.4e7d800000000p-1, 0x1.1b75bb09cb098p-25),
+    (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27),
+    (double2)(0x1.56bf9c0000000p-1, 0x1.5b3f399411c62p-25),
+    (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26),
+    (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26),
+    (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25),
+)
+
diff --git a/amd-builtins/math64/logbD.cl b/amd-builtins/math64/logbD.cl
new file mode 100644
index 0000000..895a621
--- /dev/null
+++ b/amd-builtins/math64/logbD.cl
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+logb(double x)
+{
+    long ax = as_long(x) & EXSIGNBIT_DP64;
+    double s = -1011L - clz(ax);
+    double r = (int)(ax >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+    r = ax >= PINFBITPATT_DP64 ? as_double(ax) : r;
+    r = ax < 0x0010000000000000L ? s : r;
+    r = ax == 0L ? as_double(NINFBITPATT_DP64) : r;
+    return r;
+}
+
diff --git a/amd-builtins/math64/madD.cl b/amd-builtins/math64/madD.cl
new file mode 100644
index 0000000..5cb6d87
--- /dev/null
+++ b/amd-builtins/math64/madD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+mad(double x, double y, double z)
+{
+    return __amdil_mad_f64(x, y, z);
+}
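+
+// Note (illustrative): unlike fma, OpenCL allows mad to be implemented with
+// an implementation-defined rounding of x*y+z, trading accuracy for speed;
+// hence the separate __amdil_mad_f64 intrinsic rather than reusing
+// __amdil_fma_f64.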
+
diff --git a/amd-builtins/math64/math64.h b/amd-builtins/math64/math64.h
new file mode 100644
index 0000000..d6bae16
--- /dev/null
+++ b/amd-builtins/math64/math64.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MATH64_H
+#define MATH64_H 1
+
+extern __attribute__((pure)) double __amdil_copysign_f64(double, double);
+extern __attribute__((pure)) double __amdil_fma_f64(double, double, double);
+extern __attribute__((pure)) double __amdil_mad_f64(double, double, double);
+extern __attribute__((pure)) double __amdil_max_f64(double, double);
+extern __attribute__((pure)) double __amdil_min_f64(double, double);
+extern __attribute__((pure)) double __amdil_fraction_f64(double);
+extern __attribute__((pure)) double __amdil_fabs_f64(double);
+extern __attribute__((pure)) double __amdil_round_nearest_f64(double);
+extern __attribute__((pure)) double __amdil_round_neginf_f64(double);
+extern __attribute__((pure)) double __amdil_round_posinf_f64(double);
+extern __attribute__((pure)) double __amdil_round_zero_f64(double);
+extern __attribute__((pure)) double __amdil_rsq_f64(double);
+
+extern __attribute__((pure)) int __amdil_class_f64(double, int);
+
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+// Allow control over how division is done
+#define MATH_DIVIDE(X,Y) ((X) / (Y))
+#define MATH_RECIP(X) (1.0 / (X))
+
+// // Allow control over square root
+#define MATH_SQRT(X) sqrt(X)
+
+// Table stuff
+#define TABLE_SPACE __constant
+
+#define TABLE_MANGLE(NAME) __math64_##NAME
+
+#define USE_TABLE(TYPE,PTR,NAME) \
+    extern TABLE_SPACE TYPE TABLE_MANGLE(NAME) []; \
+    TABLE_SPACE TYPE * PTR = TABLE_MANGLE(NAME)
+
+#define DECLARE_TABLE(TYPE,NAME,LENGTH,...) \
+    TABLE_SPACE TYPE TABLE_MANGLE(NAME) [ LENGTH ] = { __VA_ARGS__ };
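+
+// Intended pairing (sketch): one translation unit emits the table,
+//
+//   DECLARE_TABLE(double2, LN_TBL, 65, ...)
+//
+// and each consumer binds a __constant pointer to it:
+//
+//   USE_TABLE(double2, p_tbl, LN_TBL);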
+
+/* Definitions for double functions on 64 bit machines */
+#define SIGNBIT_DP64      0x8000000000000000L
+#define EXSIGNBIT_DP64    0x7fffffffffffffffL
+#define EXPBITS_DP64      0x7ff0000000000000L
+#define MANTBITS_DP64     0x000fffffffffffffL
+#define ONEEXPBITS_DP64   0x3ff0000000000000L
+#define TWOEXPBITS_DP64   0x4000000000000000L
+#define HALFEXPBITS_DP64  0x3fe0000000000000L
+#define IMPBIT_DP64       0x0010000000000000L
+#define QNANBITPATT_DP64  0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64  0x7ff0000000000000L
+#define NINFBITPATT_DP64  0xfff0000000000000L
+#define EXPBIAS_DP64      1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64   1
+#define EMIN_DP64         -1022
+#define BIASEDEMAX_DP64   2046 /* 0x7fe */
+#define EMAX_DP64         1023 /* 0x3ff */
+#define LAMBDA_DP64       1.0e300
+#define MANTLENGTH_DP64   53
+#define BASEDIGITS_DP64   15
+
+#define ALIGNED(x)	__attribute__((aligned(x)))
+
+#endif /* MATH64_H */
+
diff --git a/amd-builtins/math64/maxmagD.cl b/amd-builtins/math64/maxmagD.cl
new file mode 100644
index 0000000..7a261cc
--- /dev/null
+++ b/amd-builtins/math64/maxmagD.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+maxmag(double x, double y)
+{
+    long ix = as_long(x);
+    long iy = as_long(y);
+    long ax = ix & 0x7fffffffffffffffL;
+    long ay = iy & 0x7fffffffffffffffL;
+    ax |= -(ax > 0x7ff0000000000000L);
+    ay |= -(ay > 0x7ff0000000000000L);
+    return as_double((-(ax > ay) & ix) |
+	             (-(ay > ax) & iy) |
+		     (-(ax == ay) & ((ix & iy) | (ax & 0x0008000000000000L))));
+}
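+
+// Selection sketch (illustrative): -(cond) on a long is an all-ones mask
+// when cond holds, so the three masked terms form a branchless select:
+//
+//   maxmag(-3.0, 2.0)   // == -3.0, larger magnitude wins
+//   maxmag(-2.0, 2.0)   // ==  2.0, equal magnitudes fall back to fmax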
+
diff --git a/amd-builtins/math64/minmagD.cl b/amd-builtins/math64/minmagD.cl
new file mode 100644
index 0000000..e357071
--- /dev/null
+++ b/amd-builtins/math64/minmagD.cl
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+minmag(double x, double y)
+{
+    long ix = as_long(x);
+    long iy = as_long(y);
+    long ax = ix & 0x7fffffffffffffffL;
+    long ay = iy & 0x7fffffffffffffffL;
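+    // A NaN loses automatically here: its magnitude bits exceed those of
+    // +inf, so the other operand compares smaller. On equal magnitudes,
+    // ix | iy keeps the sign if either input is negative, matching fmin.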
+    return as_double((-(ax < ay) & ix) |
+                     (-(ay < ax) & iy) |
+                     (-(ax == ay) & (ix | iy)));
+}
+
diff --git a/amd-builtins/math64/modfD.cl b/amd-builtins/math64/modfD.cl
new file mode 100644
index 0000000..2c21a8a
--- /dev/null
+++ b/amd-builtins/math64/modfD.cl
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+modf(double x, double *iptr)
+{
+    long ux = as_long(x);
+    int e = ((int)(ux >> 52) & 0x7ff) - 1023;
+    long s = ux & 0x8000000000000000L;
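+    // msk keeps the sign, the exponent and the e integer bits of the
+    // mantissa, so i holds the integral part. For e outside [0,51] the
+    // shift is out of range; the selects below supply the result instead.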
+    long msk = 0xffffffffffffffffL << (52 - e);
+    long i = msk & ux;
+    long r = as_long(x - as_double(i));
+
+    r = e < 0 ? ux : r;
+    i = e < 0 ? s : i;
+
+    r = e >= 52 ? s : r;
+    i = e >= 52 ? ux : i;
+
+    r = (ux & 0x7fffffffffffffffL) > 0x7ff0000000000000L ? ux : r;
+
+    *iptr = as_double(i);
+    return as_double(r);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
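+// Before OpenCL C 2.0 there is no generic address space, so explicit
+// __global and __local overloads forward to the private-pointer version.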
+__attribute__((overloadable, always_inline)) double
+modf(double x, __global double *iptr)
+{
+    double i;
+    double f = modf(x, &i);
+    *iptr = i;
+    return f;
+}
+
+__attribute__((overloadable, always_inline)) double
+modf(double x, __local double *iptr)
+{
+    double i;
+    double f = modf(x, &i);
+    *iptr = i;
+    return f;
+}
+#endif
+
diff --git a/amd-builtins/math64/nanD.cl b/amd-builtins/math64/nanD.cl
new file mode 100644
index 0000000..394e807
--- /dev/null
+++ b/amd-builtins/math64/nanD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+nan(ulong nancode)
+{
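+    // Keep the low 52 bits of nancode as the NaN payload and OR in an
+    // all-ones exponent plus the quiet bit (0x7ff8...), giving a quiet NaN.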
+    return as_double((nancode & 0x000fffffffffffffUL) | 0x7ff8000000000000UL);
+}
+
diff --git a/amd-builtins/math64/nextafterD.cl b/amd-builtins/math64/nextafterD.cl
new file mode 100644
index 0000000..6863004
--- /dev/null
+++ b/amd-builtins/math64/nextafterD.cl
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+nextafter(double x, double y)
+{
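+    // Map each bit pattern to a signed integer that increases monotonically
+    // with the floating-point value (negative patterns are reflected through
+    // 0x8000000000000000L). Stepping that integer by +/-1 moves to the
+    // adjacent representable double; the inverse map recovers the bits.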
+    long ix = as_long(x);
+    long ax = ix & 0x7fffffffffffffffL;
+    long mx = 0x8000000000000000L - ix;
+    mx = ix < 0 ? mx : ix;
+    long iy = as_long(y);
+    long ay = iy & 0x7fffffffffffffffL;
+    long my = 0x8000000000000000L - iy;
+    my = iy < 0 ? my : iy;
+    long t = mx + (mx < my ? 1 : -1);
+    long r = 0x8000000000000000L - t;
+    r = t < 0 ? r : t;
+    r = ax > 0x7ff0000000000000L ? ix : r;
+    r = ay > 0x7ff0000000000000L ? iy : r;
+    r = ((ax|ay) == 0L) | (ix == iy) ? iy : r;
+    return as_double(r);
+}
+
diff --git a/amd-builtins/math64/pibits64.h b/amd-builtins/math64/pibits64.h
new file mode 100644
index 0000000..e383a54
--- /dev/null
+++ b/amd-builtins/math64/pibits64.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+DECLARE_TABLE(uchar, PIBITS,,
+    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+    230, 139, 2, 0, 0, 0, 0, 0, 0, 0
+)
+
diff --git a/amd-builtins/math64/powD.cl b/amd-builtins/math64/powD.cl
new file mode 100644
index 0000000..e078d87
--- /dev/null
+++ b/amd-builtins/math64/powD.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "powD_base.h"
+
diff --git a/amd-builtins/math64/powD_base.h b/amd-builtins/math64/powD_base.h
new file mode 100644
index 0000000..ec34bad
--- /dev/null
+++ b/amd-builtins/math64/powD_base.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline, weak)) double
+#if defined(COMPILING_POWR)
+powr(double x, double y)
+#elif defined(COMPILING_POWN)
+pown(double x, int ny)
+#elif defined(COMPILING_ROOTN)
+rootn(double x, int ny)
+#else
+pow(double x, double y)
+#endif
+{
+    const double real_log2_tail = 5.76999904754328540596e-08;
+    const double real_log2_lead = 6.93147122859954833984e-01;
+
+    USE_TABLE(double2, p_powlog_tbl, POWLOG_TBL);
+    USE_TABLE(double2, p_log_F_inv_tbl, LOG_F_INV_TBL);
+    USE_TABLE(double2, p_two_to, TWO_TO_JBY64_EP);
+
+#if defined(COMPILING_POWN)
+    double y = (double) ny;
+#elif defined(COMPILING_ROOTN)
+    double dny = (double)ny;
+    double y = 1.0 / dny;
+#endif
+
+    long ux = as_long(x);
+    long ax = ux & (~SIGNBIT_DP64);
+    int xpos = ax == ux;
+
+    long uy = as_long(y);
+    long ay = uy & (~SIGNBIT_DP64);
+    int ypos = ay == uy;
+
+    // Extended precision log
+    double v, vt;
+    {
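+        // Compute ln(x) in head/tail (double-double) form: reduce the
+        // mantissa against a tabulated reciprocal F, evaluate a short
+        // polynomial in the remainder r, and fold in xexp * ln(2) split
+        // the same way. The result feeds y * ln(x) as the pair (v, vt).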
+        int exp = (int)(ax >> 52) - 1023;
+        int mask_exp_1023 = exp == -1023;
+        double xexp = (double) exp;
+        long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
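+        // A biased exponent of 0 (exp == -1023) marks a subnormal (or zero,
+        // caught later by the edge cases): renormalize by forming
+        // 1.mantissa - 1.0, then re-extract the exponent and mantissa.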
+        long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+        exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+        double xexp1 = (double) exp;
+        long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+        xexp = mask_exp_1023 ? xexp1 : xexp;
+        mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+        long rax = (mantissa & 0x000ff00000000000L) + ((mantissa & 0x0000080000000000L) << 1);
+        int index = rax >> 44;
+
+        double F = as_double(rax | 0x3FE0000000000000L);
+        double Y = as_double(mantissa | 0x3FE0000000000000L);
+        double f = F - Y;
+        double2 tv = p_log_F_inv_tbl[index];
+        double log_h = tv.s0;
+        double log_t = tv.s1;
+        double f_inv = (log_h + log_t) * f;
+        double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+        double r2 = fma(-F, r1, f) * (log_h + log_t);
+        double r = r1 + r2;
+
+        double poly = fma(r,
+                          fma(r,
+                              fma(r,
+                                  fma(r, 1.0/7.0, 1.0/6.0),
+                                  1.0/5.0),
+                              1.0/4.0),
+                          1.0/3.0);
+        poly = poly * r * r * r;
+
+        double hr1r1 = 0.5*r1*r1;
+        double poly0h = r1 + hr1r1;
+        double poly0t = r1 - poly0h + hr1r1;
+        poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
+
+        tv = p_powlog_tbl[index];
+        log_h = tv.s0;
+        log_t = tv.s1;
+
+        double resT_t = fma(xexp, real_log2_tail, log_t) - poly;
+        double resT = resT_t - poly0h;
+        double resH = fma(xexp, real_log2_lead, log_h);
+        double resT_h = poly0h;
+
+        double H = resT + resH;
+        double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+        double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+        H = H_h;
+
+        double y_head = as_double(uy & 0xfffffffff8000000L);
+        double y_tail = y - y_head;
+
+#if defined(COMPILING_POWN)
+        int mask_2_24 = ay > 0x4170000000000000L; // 2^24
+        int nyh = convert_int(y_head);
+        int nyt = ny - nyh;
+        double y_tail1 = (double)nyt;
+        y_tail = mask_2_24 ? y_tail1 : y_tail;
+#endif
+
+#if defined(COMPILING_ROOTN)
+        double fnyh = as_double(as_long(dny) & 0xfffffffffff00000L);
+        double fnyt = (double)(ny - (int)fnyh);
+        y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0))/ dny;
+#endif
+
+        double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
+        v = fma(y_head, H, temp);
+        vt = fma(y_head, H, -v) + temp;
+    }
+
+    // Now calculate exp of (v,vt)
+
+    double expv;
+    {
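+        // exp(v + vt): write v = (64*m + j) * ln(2)/64 + r, look up
+        // 2^(j/64) as a head/tail pair (f1, f2), approximate e^r with a
+        // short polynomial, and scale by 2^m at the end.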
+        const double max_exp_arg = 709.782712893384;
+        const double min_exp_arg = -745.1332191019411;
+        const double sixtyfour_by_lnof2 = 92.33248261689366;
+        const double lnof2_by_64_head = 0.010830424260348081;
+        const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+        double temp = v * sixtyfour_by_lnof2;
+        int n = (int)temp;
+        double dn = (double)n;
+        int j = n & 0x0000003f;
+        int m = n >> 6;
+
+        double2 tv = p_two_to[j];
+        double f1 = tv.s0;
+        double f2 = tv.s1;
+        double f = f1 + f2;
+
+        double r1 = fma(dn, -lnof2_by_64_head, v);
+        double r2 = dn * lnof2_by_64_tail;
+        double r = (r1 + r2) + vt;
+
+        double q = fma(r,
+                       fma(r,
+                           fma(r,
+                               fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                               4.16666666662260795726e-02),
+                           1.66666666665260878863e-01),
+                       5.00000000000000008883e-01);
+        q = fma(r*r, q, r);
+
+        expv = fma(f, q, f2) + f1;
+        expv = ldexp(expv, m);
+
+        expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+        expv = v < min_exp_arg ? 0.0 : expv;
+    }
+
+    // See whether y is an integer.
+    // inty = 0 means not an integer.
+    // inty = 1 means odd integer.
+    // inty = 2 means even integer.
+
+#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN)
+    int inty = 2 - (ny & 1);
+#else
+    int inty;
+    {
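+        // yexp is the number of integer bits in |y|: less than 1 means
+        // |y| < 1 (not an integer); more than 53 means every representable
+        // value is an even integer; otherwise mask tests the fraction bits
+        // and the lowest integer bit decides odd versus even.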
+        int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
+        inty = yexp < 1 ? 0 : 2;
+        inty = yexp > 53 ? 2 : inty;
+        long mask = (1L << (53 - yexp)) - 1L;
+        int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
+        inty1 = (ay & mask) != 0 ? 0 : inty1;
+        inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
+    }
+#endif
+
+    expv *= inty == 1 & !xpos ? -1.0 : 1.0;
+
+    long ret = as_long(expv);
+
+    // Now all the edge cases
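+    // Each select below patches ret for one IEEE-754 special case
+    // (signed zeros, infinities, NaNs, y == 0, x == 1); later lines
+    // take precedence where conditions overlap.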
+
+#if defined COMPILING_POWR
+    ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+    ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ret;
+    ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64 : ret;
+    ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+    ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret;
+    ret = ax == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ax == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax == PINFBITPATT_DP64 & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ax == PINFBITPATT_DP64 & ay == 0L ? QNANBITPATT_DP64 : ret;
+    ret = ax == 0L & !ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax == 0L & ypos ? 0L : ret;
+    ret = ax == 0L & ay == 0L ? QNANBITPATT_DP64 : ret;
+    ret = ax != 0L & !xpos ? QNANBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret;
+    ret = ay > PINFBITPATT_DP64 ? uy : ret;
+#elif defined COMPILING_POWN
+    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+    ret = ax == 0L & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0L & !ypos & inty == 2 ? PINFBITPATT_DP64 : ret;
+    ret = ax == 0L & ypos & inty == 2 ? 0L : ret;
+    long xzero = !xpos ? 0x8000000000000000L : 0L;
+    ret = ax == 0L & ypos & inty == 1 ? xzero : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty == 1 ? 0x8000000000000000L : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty != 1 ? 0L : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty == 1 ? NINFBITPATT_DP64 : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty != 1 ? PINFBITPATT_DP64 : ret;
+    ret = ux == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ux == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret;
+    ret = ny == 0 ? 0x3ff0000000000000L : ret;
+#elif defined COMPILING_ROOTN
+    ret = !xpos & inty == 2 ? QNANBITPATT_DP64 : ret;
+    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+    ret = ax == 0L & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0L & !ypos & inty == 2 ? PINFBITPATT_DP64 : ret;
+    ret = ax == 0L & ypos & inty == 2 ? 0L : ret;
+    long xzero = xpos ? 0L : 0x8000000000000000L;
+    ret = ax == 0L & ypos & inty == 1 ? xzero : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty == 1 ? NINFBITPATT_DP64 : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty == 1 ? 0x8000000000000000L : ret;
+    ret = ux == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ux == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret;
+    ret = ny == 0 ? QNANBITPATT_DP64 : ret;
+#else
+    ret = !xpos & inty == 0 ? QNANBITPATT_DP64 : ret;
+    ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+    ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+    ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+    ret = ax == 0L & !ypos & inty == 1 ? xinf : ret;
+    ret = ax == 0L & !ypos & inty != 1 ? PINFBITPATT_DP64 : ret;
+    long xzero = xpos ? 0L : 0x8000000000000000L;
+    ret = ax == 0L & ypos & inty == 1 ? xzero : ret;
+    ret = ax == 0L & ypos & inty != 1 ? 0L : ret;
+    ret = ax == 0L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
+    ret = ux == 0xbff0000000000000L & ay == PINFBITPATT_DP64 ? 0x3ff0000000000000L : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty == 1 ? 0x8000000000000000L : ret;
+    ret = ux == NINFBITPATT_DP64 & !ypos & inty != 1 ? 0L : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty == 1 ? NINFBITPATT_DP64 : ret;
+    ret = ux == NINFBITPATT_DP64 & ypos & inty != 1 ? PINFBITPATT_DP64 : ret;
+    ret = ux == PINFBITPATT_DP64 & !ypos ? 0L : ret;
+    ret = ux == PINFBITPATT_DP64 & ypos ? PINFBITPATT_DP64 : ret;
+    ret = ax > PINFBITPATT_DP64 ? ux : ret;
+    ret = ay > PINFBITPATT_DP64 ? uy : ret;
+    ret = ay == 0L ? 0x3ff0000000000000L : ret;
+    ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
+#endif
+
+    return as_double(ret);
+}
+
diff --git a/amd-builtins/math64/powD_table.h b/amd-builtins/math64/powD_table.h
new file mode 100644
index 0000000..95c9d5d
--- /dev/null
+++ b/amd-builtins/math64/powD_table.h
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+DECLARE_TABLE(double2, POWLOG_TBL, 258,
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.ff00aa0000000p-9, 0x1.5885e0250435ap-36),
+    (double2)(0x1.fe02a60000000p-8, 0x1.620cf11f86ed2p-33),
+    (double2)(0x1.7dc4750000000p-7, 0x1.f0214edba4a25p-32),
+    (double2)(0x1.fc0a8b0000000p-7, 0x1.f807c79f3db4ep-36),
+    (double2)(0x1.3cea440000000p-6, 0x1.a352ba779a52bp-33),
+    (double2)(0x1.7b91b00000000p-6, 0x1.f56c46aa49fd5p-32),
+    (double2)(0x1.b9fc020000000p-6, 0x1.ebe465fef5196p-32),
+    (double2)(0x1.f829b00000000p-6, 0x1.cf0660099f1f8p-31),
+    (double2)(0x1.1b0d980000000p-5, 0x1.247b2ff85945dp-30),
+    (double2)(0x1.39e87b0000000p-5, 0x1.3fd7abf5202b6p-30),
+    (double2)(0x1.58a5ba0000000p-5, 0x1.f91c9a918d51ep-30),
+    (double2)(0x1.77458f0000000p-5, 0x1.8cb73f118d3cap-31),
+    (double2)(0x1.95c8300000000p-5, 0x1.d91c7d6fad074p-30),
+    (double2)(0x1.b42dd70000000p-5, 0x1.1971bec28d14cp-33),
+    (double2)(0x1.d276b80000000p-5, 0x1.5b616a423c78ap-30),
+    (double2)(0x1.f0a30c0000000p-5, 0x1.162a6617cc971p-37),
+    (double2)(0x1.0759830000000p-4, 0x1.66391c4c06d29p-30),
+    (double2)(0x1.16536e0000000p-4, 0x1.d46f5c1d0c4b8p-29),
+    (double2)(0x1.253f620000000p-4, 0x1.e14282df1f6d3p-29),
+    (double2)(0x1.341d790000000p-4, 0x1.86f47424a660dp-30),
+    (double2)(0x1.42edcb0000000p-4, 0x1.d4c8de077753ep-29),
+    (double2)(0x1.51b0730000000p-4, 0x1.e0c307ed24f1cp-29),
+    (double2)(0x1.60658a0000000p-4, 0x1.26ea18763bdd3p-29),
+    (double2)(0x1.6f0d280000000p-4, 0x1.5cad69737c933p-29),
+    (double2)(0x1.7da7660000000p-4, 0x1.af62599088901p-29),
+    (double2)(0x1.8c345d0000000p-4, 0x1.8c66c83d6b2d0p-30),
+    (double2)(0x1.9ab4240000000p-4, 0x1.880ceb36fb30fp-30),
+    (double2)(0x1.a926d30000000p-4, 0x1.495aac6ca17a4p-29),
+    (double2)(0x1.b78c820000000p-4, 0x1.761db4210878cp-29),
+    (double2)(0x1.c5e5480000000p-4, 0x1.eb78e862bac2fp-29),
+    (double2)(0x1.d4313d0000000p-4, 0x1.9b2cd75790dd9p-30),
+    (double2)(0x1.e270760000000p-4, 0x1.c55e5cbd3d50fp-29),
+    (double2)(0x1.f0a30c0000000p-4, 0x1.162a6617cc971p-36),
+    (double2)(0x1.fec9130000000p-4, 0x1.dbeabaaa2e519p-32),
+    (double2)(0x1.0671510000000p-3, 0x1.652cb7150c647p-30),
+    (double2)(0x1.0d77e70000000p-3, 0x1.9a11cb2cd2ee2p-28),
+    (double2)(0x1.1478580000000p-3, 0x1.19d0ab1a28813p-29),
+    (double2)(0x1.1b72ad0000000p-3, 0x1.4bd9e80a41811p-29),
+    (double2)(0x1.2266f10000000p-3, 0x1.214b596faa3dfp-28),
+    (double2)(0x1.29552f0000000p-3, 0x1.03fea46980bb8p-28),
+    (double2)(0x1.303d710000000p-3, 0x1.1c8ffa5fd28c7p-28),
+    (double2)(0x1.371fc20000000p-3, 0x1.e8f743bcd96c5p-35),
+    (double2)(0x1.3dfc2b0000000p-3, 0x1.d98c5395315c6p-32),
+    (double2)(0x1.44d2b60000000p-3, 0x1.996fa3ccfa7b2p-28),
+    (double2)(0x1.4ba36f0000000p-3, 0x1.cd2af2ad13037p-30),
+    (double2)(0x1.526e5e0000000p-3, 0x1.d0da1bd17200ep-30),
+    (double2)(0x1.59338d0000000p-3, 0x1.330410ba68b75p-28),
+    (double2)(0x1.5ff3070000000p-3, 0x1.4f27a790e7c41p-32),
+    (double2)(0x1.66acd40000000p-3, 0x1.3956a86f6ff1bp-30),
+    (double2)(0x1.6d60fe0000000p-3, 0x1.c6748723551d9p-29),
+    (double2)(0x1.740f8f0000000p-3, 0x1.500de9326cdfcp-29),
+    (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30),
+    (double2)(0x1.815c0a0000000p-3, 0x1.4357ead6836ffp-31),
+    (double2)(0x1.87fa060000000p-3, 0x1.4832442408024p-29),
+    (double2)(0x1.8e928d0000000p-3, 0x1.d10da8154b13dp-28),
+    (double2)(0x1.9525a90000000p-3, 0x1.9e8ad68ec8260p-28),
+    (double2)(0x1.9bb3620000000p-3, 0x1.cfbf706abaf18p-28),
+    (double2)(0x1.a23bc10000000p-3, 0x1.fc56ac6326e23p-28),
+    (double2)(0x1.a8becf0000000p-3, 0x1.9105e3185cf21p-28),
+    (double2)(0x1.af3c940000000p-3, 0x1.d017fe5b19cc0p-28),
+    (double2)(0x1.b5b5190000000p-3, 0x1.d1f6b48dd13fep-28),
+    (double2)(0x1.bc28670000000p-3, 0x1.0b63358a7e73ap-29),
+    (double2)(0x1.c296850000000p-3, 0x1.63063028c211cp-29),
+    (double2)(0x1.c8ff7c0000000p-3, 0x1.e6a6886b09760p-29),
+    (double2)(0x1.cf63540000000p-3, 0x1.c138bb891cd03p-28),
+    (double2)(0x1.d5c2160000000p-3, 0x1.69f7722b7221ap-28),
+    (double2)(0x1.dc1bca0000000p-3, 0x1.57d8fac1a628cp-32),
+    (double2)(0x1.e270760000000p-3, 0x1.c55e5cbd3d50fp-28),
+    (double2)(0x1.e8c0250000000p-3, 0x1.552d2ff48fe2ep-30),
+    (double2)(0x1.ef0adc0000000p-3, 0x1.7b8b26ca431bcp-28),
+    (double2)(0x1.f550a50000000p-3, 0x1.92decdc1c5f6dp-29),
+    (double2)(0x1.fb91860000000p-3, 0x1.abc7c551aaa8cp-28),
+    (double2)(0x1.00e6c40000000p-2, 0x1.6b540731a354bp-28),
+    (double2)(0x1.0402590000000p-2, 0x1.2d341036b89efp-28),
+    (double2)(0x1.071b850000000p-2, 0x1.f9ab21a3a2e0fp-27),
+    (double2)(0x1.0a324e0000000p-2, 0x1.39c871afb9fbdp-29),
+    (double2)(0x1.0d46b50000000p-2, 0x1.e6add2c81f640p-28),
+    (double2)(0x1.1058bf0000000p-2, 0x1.35c95aa313f41p-27),
+    (double2)(0x1.1368700000000p-2, 0x1.49d4582f6cc53p-29),
+    (double2)(0x1.1675ca0000000p-2, 0x1.7574c1c07398fp-27),
+    (double2)(0x1.1980d20000000p-2, 0x1.ba846dece9e8dp-27),
+    (double2)(0x1.1c898c0000000p-2, 0x1.6999fafbc68e7p-30),
+    (double2)(0x1.1f8ff90000000p-2, 0x1.c9145e51b0103p-27),
+    (double2)(0x1.22941f0000000p-2, 0x1.79ef2cb44850ap-27),
+    (double2)(0x1.2596010000000p-2, 0x1.beec73de11275p-31),
+    (double2)(0x1.2895a10000000p-2, 0x1.ef4351af5a498p-29),
+    (double2)(0x1.2b93030000000p-2, 0x1.5713a493b4a50p-27),
+    (double2)(0x1.2e8e2b0000000p-2, 0x1.5c23a61385992p-27),
+    (double2)(0x1.31871c0000000p-2, 0x1.2a88309f57299p-27),
+    (double2)(0x1.347dd90000000p-2, 0x1.530faa9ac8acep-27),
+    (double2)(0x1.3772660000000p-2, 0x1.5fec2d792a758p-29),
+    (double2)(0x1.3a64c50000000p-2, 0x1.5a517a71cbcd7p-28),
+    (double2)(0x1.3d54fa0000000p-2, 0x1.707dc3e1cd9a3p-28),
+    (double2)(0x1.4043080000000p-2, 0x1.a1a9f8ef43049p-28),
+    (double2)(0x1.432ef20000000p-2, 0x1.409d0276b3674p-27),
+    (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bd9p-29),
+    (double2)(0x1.4900680000000p-2, 0x1.0027433001e5fp-32),
+    (double2)(0x1.4be5f90000000p-2, 0x1.5dde2836d3265p-28),
+    (double2)(0x1.4ec9730000000p-2, 0x1.300134d7aaf04p-29),
+    (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f5p-28),
+    (double2)(0x1.548a2c0000000p-2, 0x1.d6e93167e6308p-29),
+    (double2)(0x1.5767710000000p-2, 0x1.d1569b1526adbp-28),
+    (double2)(0x1.5a42ab0000000p-2, 0x1.e99fc338a1a41p-31),
+    (double2)(0x1.5d1bdb0000000p-2, 0x1.eb01394a11b1cp-27),
+    (double2)(0x1.5ff3070000000p-2, 0x1.4f27a790e7c41p-31),
+    (double2)(0x1.62c82f0000000p-2, 0x1.5ce3ca97b7af9p-29),
+    (double2)(0x1.659b570000000p-2, 0x1.81f0f940ed857p-29),
+    (double2)(0x1.686c810000000p-2, 0x1.d36295d88857cp-27),
+    (double2)(0x1.6b3bb20000000p-2, 0x1.1aca1ec4af526p-29),
+    (double2)(0x1.6e08ea0000000p-2, 0x1.45743c7182726p-27),
+    (double2)(0x1.70d42e0000000p-2, 0x1.3c491aead337ep-29),
+    (double2)(0x1.739d7f0000000p-2, 0x1.aef401a738931p-28),
+    (double2)(0x1.7664e10000000p-2, 0x1.1cede76092a29p-29),
+    (double2)(0x1.792a550000000p-2, 0x1.fba8f44f82bb4p-27),
+    (double2)(0x1.7bede00000000p-2, 0x1.46f5f7f3c3e1ap-27),
+    (double2)(0x1.7eaf830000000p-2, 0x1.7055f86c9674bp-27),
+    (double2)(0x1.816f410000000p-2, 0x1.b41a92b6b6e1ap-27),
+    (double2)(0x1.842d1d0000000p-2, 0x1.43d162e927628p-27),
+    (double2)(0x1.86e9190000000p-2, 0x1.466174013f9b1p-27),
+    (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28),
+    (double2)(0x1.8c5b7c0000000p-2, 0x1.0b169150faa58p-27),
+    (double2)(0x1.8f11e80000000p-2, 0x1.cd98b1df85da7p-28),
+    (double2)(0x1.91c67e0000000p-2, 0x1.68b507b0f8fa8p-27),
+    (double2)(0x1.9479410000000p-2, 0x1.8422df57499bap-27),
+    (double2)(0x1.972a340000000p-2, 0x1.1351586970274p-30),
+    (double2)(0x1.99d9580000000p-2, 0x1.17e08acba92eep-30),
+    (double2)(0x1.9c86b00000000p-2, 0x1.6e04314dd0229p-29),
+    (double2)(0x1.9f323e0000000p-2, 0x1.97f3097e56d1ap-27),
+    (double2)(0x1.a1dc060000000p-2, 0x1.356e655901286p-28),
+    (double2)(0x1.a484090000000p-2, 0x1.cb761457f94d6p-31),
+    (double2)(0x1.a72a490000000p-2, 0x1.9af67a85a9dacp-28),
+    (double2)(0x1.a9cec90000000p-2, 0x1.53410931a909fp-27),
+    (double2)(0x1.ac718c0000000p-2, 0x1.2c587206058f5p-29),
+    (double2)(0x1.af12930000000p-2, 0x1.23bc358899c22p-29),
+    (double2)(0x1.b1b1e00000000p-2, 0x1.d7bf8b6d223cbp-27),
+    (double2)(0x1.b44f770000000p-2, 0x1.7991ec5197ddbp-27),
+    (double2)(0x1.b6eb590000000p-2, 0x1.a79e6bb3a9219p-27),
+    (double2)(0x1.b985890000000p-2, 0x1.a4c43ed663ec5p-28),
+    (double2)(0x1.bc1e080000000p-2, 0x1.61b5a1484f438p-27),
+    (double2)(0x1.beb4d90000000p-2, 0x1.b4e36f7ef0c3ap-27),
+    (double2)(0x1.c149ff0000000p-2, 0x1.15f026acd0d1bp-30),
+    (double2)(0x1.c3dd7a0000000p-2, 0x1.f36b535cecf05p-28),
+    (double2)(0x1.c66f4e0000000p-2, 0x1.ffb7fbf3eb5c6p-29),
+    (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28),
+    (double2)(0x1.cb8e070000000p-2, 0x1.135eb27f5bbc3p-28),
+    (double2)(0x1.ce1af00000000p-2, 0x1.70be7d6f6fa57p-27),
+    (double2)(0x1.d0a63a0000000p-2, 0x1.ce43cc84ab338p-27),
+    (double2)(0x1.d32fe70000000p-2, 0x1.c01d7aac3bd91p-27),
+    (double2)(0x1.d5b7f90000000p-2, 0x1.5c58d07961060p-27),
+    (double2)(0x1.d83e720000000p-2, 0x1.628bcf941456ep-28),
+    (double2)(0x1.dac3530000000p-2, 0x1.c58b2a8461cd2p-27),
+    (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989ap-28),
+    (double2)(0x1.dfc8590000000p-2, 0x1.20dab6a80f09cp-27),
+    (double2)(0x1.e248810000000p-2, 0x1.4f8d84c397b1ep-27),
+    (double2)(0x1.e4c71a0000000p-2, 0x1.0d0ee08599e48p-27),
+    (double2)(0x1.e744260000000p-2, 0x1.d68787e37da36p-30),
+    (double2)(0x1.e9bfa60000000p-2, 0x1.66187d591bafcp-28),
+    (double2)(0x1.ec399d0000000p-2, 0x1.2346600bae772p-29),
+    (double2)(0x1.eeb20c0000000p-2, 0x1.90377d0d61b8ep-28),
+    (double2)(0x1.f128f50000000p-2, 0x1.f5e0dd966b907p-27),
+    (double2)(0x1.f39e5b0000000p-2, 0x1.9023cb79a00e2p-27),
+    (double2)(0x1.f6123f0000000p-2, 0x1.4e05158c28ad8p-27),
+    (double2)(0x1.f884a30000000p-2, 0x1.bfa7b08b18ae4p-28),
+    (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f67p-27),
+    (double2)(0x1.fd64f20000000p-2, 0x1.ec2ae39493d4fp-31),
+    (double2)(0x1.ffd2e00000000p-2, 0x1.0afe930ab2fa0p-27),
+    (double2)(0x1.011fab0000000p-1, 0x1.25ff8a1810dd4p-29),
+    (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27),
+    (double2)(0x1.0389ee0000000p-1, 0x1.f9cc676785571p-26),
+    (double2)(0x1.04bdf90000000p-1, 0x1.b524da4cbf982p-26),
+    (double2)(0x1.05f14b0000000p-1, 0x1.a4c8b381535b8p-26),
+    (double2)(0x1.0723e50000000p-1, 0x1.839be809caf2cp-26),
+    (double2)(0x1.0855c80000000p-1, 0x1.0968a1cb82c13p-26),
+    (double2)(0x1.0986f40000000p-1, 0x1.eae6a41723fb5p-26),
+    (double2)(0x1.0ab76b0000000p-1, 0x1.d9c29a380a4dbp-26),
+    (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27),
+    (double2)(0x1.0d163c0000000p-1, 0x1.973ad6fc108cap-26),
+    (double2)(0x1.0e44980000000p-1, 0x1.747322fdbab97p-27),
+    (double2)(0x1.0f72410000000p-1, 0x1.93692fa9d4221p-26),
+    (double2)(0x1.109f390000000p-1, 0x1.c5a992dfbc7d9p-26),
+    (double2)(0x1.11cb810000000p-1, 0x1.e1f33e102387ap-27),
+    (double2)(0x1.12f7190000000p-1, 0x1.64fbef14c048cp-27),
+    (double2)(0x1.1422020000000p-1, 0x1.490f513ca5e3bp-27),
+    (double2)(0x1.154c3d0000000p-1, 0x1.7a6af4d4c799dp-28),
+    (double2)(0x1.1675ca0000000p-1, 0x1.7574c1c07398fp-26),
+    (double2)(0x1.179eab0000000p-1, 0x1.7b133417f8c1cp-26),
+    (double2)(0x1.18c6e00000000p-1, 0x1.feb9e0c176514p-26),
+    (double2)(0x1.19ee6b0000000p-1, 0x1.19f25bb3172f7p-27),
+    (double2)(0x1.1b154b0000000p-1, 0x1.5f68a7bbfb852p-27),
+    (double2)(0x1.1c3b810000000p-1, 0x1.ee278497929f1p-26),
+    (double2)(0x1.1d610f0000000p-1, 0x1.ccee006109d58p-26),
+    (double2)(0x1.1e85f50000000p-1, 0x1.ce081a07bd8b3p-26),
+    (double2)(0x1.1faa340000000p-1, 0x1.70e12981817b8p-26),
+    (double2)(0x1.20cdcd0000000p-1, 0x1.92ab6d93503d0p-29),
+    (double2)(0x1.21f0bf0000000p-1, 0x1.8cb7dd7c3b61ep-26),
+    (double2)(0x1.23130d0000000p-1, 0x1.efafd0a0b78dap-27),
+    (double2)(0x1.2434b60000000p-1, 0x1.e907267c4288ep-26),
+    (double2)(0x1.2555bc0000000p-1, 0x1.d31ef96780875p-26),
+    (double2)(0x1.2676200000000p-1, 0x1.3430dfcd2ad50p-29),
+    (double2)(0x1.2795e10000000p-1, 0x1.44d88d75bc1f9p-28),
+    (double2)(0x1.28b5000000000p-1, 0x1.bec0f055e04fcp-26),
+    (double2)(0x1.29d37f0000000p-1, 0x1.d85611590b9adp-26),
+    (double2)(0x1.2af15f0000000p-1, 0x1.320568e583229p-32),
+    (double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26),
+    (double2)(0x1.2d2b400000000p-1, 0x1.2edc9dabba74dp-29),
+    (double2)(0x1.2e47430000000p-1, 0x1.b9009a1015086p-27),
+    (double2)(0x1.2f62a90000000p-1, 0x1.2a12a8c5b1a19p-26),
+    (double2)(0x1.307d730000000p-1, 0x1.a7885f0fdac85p-28),
+    (double2)(0x1.3197a00000000p-1, 0x1.f4ffcd43ac691p-26),
+    (double2)(0x1.32b1330000000p-1, 0x1.2243ae2640aadp-26),
+    (double2)(0x1.33ca2b0000000p-1, 0x1.46513299035d3p-26),
+    (double2)(0x1.34e2890000000p-1, 0x1.b39c3a62dd725p-26),
+    (double2)(0x1.35fa4e0000000p-1, 0x1.ba6dd40049f51p-26),
+    (double2)(0x1.37117b0000000p-1, 0x1.51d1ed7177409p-27),
+    (double2)(0x1.38280f0000000p-1, 0x1.cb0f2fd7f5216p-26),
+    (double2)(0x1.393e0d0000000p-1, 0x1.ab150cd4e2213p-28),
+    (double2)(0x1.3a53730000000p-1, 0x1.cfd7bf3193844p-26),
+    (double2)(0x1.3b68440000000p-1, 0x1.3fff8455f1dbdp-26),
+    (double2)(0x1.3c7c7f0000000p-1, 0x1.fee640b905fc9p-26),
+    (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26),
+    (double2)(0x1.3ea3390000000p-1, 0x1.b597adc1ecdd2p-28),
+    (double2)(0x1.3fb5b80000000p-1, 0x1.345bd096d3a75p-27),
+    (double2)(0x1.40c7a40000000p-1, 0x1.101b9d2453c8bp-26),
+    (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c979p-26),
+    (double2)(0x1.42e9c60000000p-1, 0x1.bbf017e595f71p-26),
+    (double2)(0x1.43f9fe0000000p-1, 0x1.7ce733bd393dcp-28),
+    (double2)(0x1.4509a50000000p-1, 0x1.33bb0a503f8a1p-29),
+    (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bd9p-28),
+    (double2)(0x1.4727430000000p-1, 0x1.e67555a635b3cp-26),
+    (double2)(0x1.48353d0000000p-1, 0x1.ea88df73d5e8bp-29),
+    (double2)(0x1.4942a80000000p-1, 0x1.d17e03bda18a8p-28),
+    (double2)(0x1.4a4f850000000p-1, 0x1.b607d76044f7ep-26),
+    (double2)(0x1.4b5bd60000000p-1, 0x1.2adc4e71bc2fcp-26),
+    (double2)(0x1.4c679a0000000p-1, 0x1.f99dc7362d1d9p-26),
+    (double2)(0x1.4d72d30000000p-1, 0x1.473fa008e6a6ap-26),
+    (double2)(0x1.4e7d810000000p-1, 0x1.b75bb09cb0985p-29),
+    (double2)(0x1.4f87a30000000p-1, 0x1.ea04dd10b9abap-26),
+    (double2)(0x1.50913c0000000p-1, 0x1.802d0d6979674p-26),
+    (double2)(0x1.519a4c0000000p-1, 0x1.74688ccd99094p-30),
+    (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27),
+    (double2)(0x1.53aad00000000p-1, 0x1.6e66df2aa374fp-27),
+    (double2)(0x1.54b2460000000p-1, 0x1.e66525ea4550ap-27),
+    (double2)(0x1.55b9350000000p-1, 0x1.2d02f34f20cbdp-27),
+    (double2)(0x1.56bf9d0000000p-1, 0x1.6cfce65047188p-27),
+    (double2)(0x1.57c57f0000000p-1, 0x1.9b78c842d58b8p-28),
+    (double2)(0x1.58cadb0000000p-1, 0x1.735e624c24bc9p-27),
+    (double2)(0x1.59cfb20000000p-1, 0x1.7eba1f7dd1adfp-27),
+    (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26),
+    (double2)(0x1.5bd7d30000000p-1, 0x1.ce38e637f1b4dp-30),
+    (double2)(0x1.5cdb1d0000000p-1, 0x1.8d82ec919edc7p-26),
+    (double2)(0x1.5ddde50000000p-1, 0x1.c52648ddcfa37p-27),
+    (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26),
+    (double2)(0x1.5fe1ed0000000p-1, 0x1.5a312311aba4fp-26),
+    (double2)(0x1.60e32f0000000p-1, 0x1.11e236329f225p-27),
+    (double2)(0x1.61e3ef0000000p-1, 0x1.b48c8cd2f246cp-26),
+    (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25),
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
+)
+
+DECLARE_TABLE(double2, LOG_F_INV_TBL, 258,
+    (double2)(0x1.0000000000000p+1, 0x0.0000000000000p+0),
+    (double2)(0x1.fe00000000000p+0, 0x1.fe01fe01fe020p-16),
+    (double2)(0x1.fc00000000000p+0, 0x1.fc07f01fc07f0p-14),
+    (double2)(0x1.fa00000000000p+0, 0x1.1caa01fa11caap-12),
+    (double2)(0x1.f800000000000p+0, 0x1.f81f81f81f820p-12),
+    (double2)(0x1.f600000000000p+0, 0x1.8856506ddaba6p-11),
+    (double2)(0x1.f400000000000p+0, 0x1.196792909c560p-10),
+    (double2)(0x1.f200000000000p+0, 0x1.7d9108c2ad433p-10),
+    (double2)(0x1.f000000000000p+0, 0x1.f07c1f07c1f08p-10),
+    (double2)(0x1.ee00000000000p+0, 0x1.38ff08b1c03ddp-9),
+    (double2)(0x1.ec00000000000p+0, 0x1.80f6603d980f6p-9),
+    (double2)(0x1.ea00000000000p+0, 0x1.d00f57403d5d0p-9),
+    (double2)(0x1.e900000000000p+0, 0x1.31abf0b7672a0p-12),
+    (double2)(0x1.e700000000000p+0, 0x1.06a965d43919bp-10),
+    (double2)(0x1.e500000000000p+0, 0x1.ceb240795ceb2p-10),
+    (double2)(0x1.e300000000000p+0, 0x1.522f3b834e67fp-9),
+    (double2)(0x1.e100000000000p+0, 0x1.c3c3c3c3c3c3cp-9),
+    (double2)(0x1.e000000000000p+0, 0x1.e01e01e01e01ep-12),
+    (double2)(0x1.de00000000000p+0, 0x1.75b8fe21a291cp-10),
+    (double2)(0x1.dc00000000000p+0, 0x1.403b9403b9404p-9),
+    (double2)(0x1.da00000000000p+0, 0x1.cc0ed7303b5ccp-9),
+    (double2)(0x1.d900000000000p+0, 0x1.79118f3fc4da2p-11),
+    (double2)(0x1.d700000000000p+0, 0x1.ed952e0b0ce46p-10),
+    (double2)(0x1.d500000000000p+0, 0x1.95900eae56404p-9),
+    (double2)(0x1.d400000000000p+0, 0x1.d41d41d41d41dp-12),
+    (double2)(0x1.d200000000000p+0, 0x1.cb28ff16c69aep-10),
+    (double2)(0x1.d000000000000p+0, 0x1.96b1edd80e866p-9),
+    (double2)(0x1.cf00000000000p+0, 0x1.372e225fe30d9p-11),
+    (double2)(0x1.cd00000000000p+0, 0x1.0ad12073615a2p-9),
+    (double2)(0x1.cb00000000000p+0, 0x1.cdb2c0397cdb3p-9),
+    (double2)(0x1.ca00000000000p+0, 0x1.2cc157b864407p-10),
+    (double2)(0x1.c800000000000p+0, 0x1.64cb5f7148404p-9),
+    (double2)(0x1.c700000000000p+0, 0x1.c71c71c71c71cp-12),
+    (double2)(0x1.c500000000000p+0, 0x1.129a21a930b84p-9),
+    (double2)(0x1.c300000000000p+0, 0x1.f1e0387f1e038p-9),
+    (double2)(0x1.c200000000000p+0, 0x1.ad4e4ba80709bp-10),
+    (double2)(0x1.c000000000000p+0, 0x1.c0e070381c0e0p-9),
+    (double2)(0x1.bf00000000000p+0, 0x1.60fba1a362bb0p-10),
+    (double2)(0x1.bd00000000000p+0, 0x1.a5713280dee96p-9),
+    (double2)(0x1.bc00000000000p+0, 0x1.3f59620f9ece9p-10),
+    (double2)(0x1.ba00000000000p+0, 0x1.9f22983759f23p-9),
+    (double2)(0x1.b900000000000p+0, 0x1.478ac63fc8d5cp-10),
+    (double2)(0x1.b700000000000p+0, 0x1.ad87bb4671656p-9),
+    (double2)(0x1.b600000000000p+0, 0x1.78b8efbb8148cp-10),
+    (double2)(0x1.b400000000000p+0, 0x1.d0369d0369d03p-9),
+    (double2)(0x1.b300000000000p+0, 0x1.d212b601b3748p-10),
+    (double2)(0x1.b200000000000p+0, 0x1.b2036406c80d9p-15),
+    (double2)(0x1.b000000000000p+0, 0x1.29663b24547d1p-9),
+    (double2)(0x1.af00000000000p+0, 0x1.435e50d79435ep-11),
+    (double2)(0x1.ad00000000000p+0, 0x1.7d0ff2920bc03p-9),
+    (double2)(0x1.ac00000000000p+0, 0x1.5c06b15c06b16p-10),
+    (double2)(0x1.aa00000000000p+0, 0x1.e3a5f0fd7f954p-9),
+    (double2)(0x1.a900000000000p+0, 0x1.1dec0d4c77b03p-9),
+    (double2)(0x1.a800000000000p+0, 0x1.73289870ac52ep-11),
+    (double2)(0x1.a600000000000p+0, 0x1.a034da034da03p-9),
+    (double2)(0x1.a500000000000p+0, 0x1.d041da2292856p-10),
+    (double2)(0x1.a400000000000p+0, 0x1.a41a41a41a41ap-12),
+    (double2)(0x1.a200000000000p+0, 0x1.8550f8a39409dp-9),
+    (double2)(0x1.a100000000000p+0, 0x1.b4fe5e92c0686p-10),
+    (double2)(0x1.a000000000000p+0, 0x1.a01a01a01a01ap-12),
+    (double2)(0x1.9e00000000000p+0, 0x1.91d2a2067b23ap-9),
+    (double2)(0x1.9d00000000000p+0, 0x1.e7c5dada0b4e5p-10),
+    (double2)(0x1.9c00000000000p+0, 0x1.68a7725080ce1p-11),
+    (double2)(0x1.9a00000000000p+0, 0x1.c49d4aa21b490p-9),
+    (double2)(0x1.9900000000000p+0, 0x1.3333333333333p-9),
+    (double2)(0x1.9800000000000p+0, 0x1.4bc363b03fccfp-10),
+    (double2)(0x1.9700000000000p+0, 0x1.c9f01970e4f81p-13),
+    (double2)(0x1.9500000000000p+0, 0x1.97617c6ef5b25p-9),
+    (double2)(0x1.9400000000000p+0, 0x1.161f9add3c0cap-9),
+    (double2)(0x1.9300000000000p+0, 0x1.319fe6cb39806p-10),
+    (double2)(0x1.9200000000000p+0, 0x1.f693a1c451ab3p-13),
+    (double2)(0x1.9000000000000p+0, 0x1.a9e240321a9e2p-9),
+    (double2)(0x1.8f00000000000p+0, 0x1.3831f3831f383p-9),
+    (double2)(0x1.8e00000000000p+0, 0x1.949ebc4dcfc1cp-10),
+    (double2)(0x1.8d00000000000p+0, 0x1.80c6980c6980cp-11),
+    (double2)(0x1.8b00000000000p+0, 0x1.f9d00c5fe7403p-9),
+    (double2)(0x1.8a00000000000p+0, 0x1.9721ed7e75347p-9),
+    (double2)(0x1.8900000000000p+0, 0x1.381ec0313381fp-9),
+    (double2)(0x1.8800000000000p+0, 0x1.b97c2aec12653p-10),
+    (double2)(0x1.8700000000000p+0, 0x1.09ef3024ae3bap-10),
+    (double2)(0x1.8600000000000p+0, 0x1.8618618618618p-12),
+    (double2)(0x1.8400000000000p+0, 0x1.e0184f00c2780p-9),
+    (double2)(0x1.8300000000000p+0, 0x1.92ef5657dba52p-9),
+    (double2)(0x1.8200000000000p+0, 0x1.4940305494030p-9),
+    (double2)(0x1.8100000000000p+0, 0x1.0303030303030p-9),
+    (double2)(0x1.8000000000000p+0, 0x1.8060180601806p-10),
+    (double2)(0x1.7f00000000000p+0, 0x1.017f405fd017fp-10),
+    (double2)(0x1.7e00000000000p+0, 0x1.12a8ad278e8ddp-11),
+    (double2)(0x1.7d00000000000p+0, 0x1.7d05f417d05f4p-14),
+    (double2)(0x1.7b00000000000p+0, 0x1.d67245c02f7d6p-9),
+    (double2)(0x1.7a00000000000p+0, 0x1.a4411c1d986a9p-9),
+    (double2)(0x1.7900000000000p+0, 0x1.754d76c7316dfp-9),
+    (double2)(0x1.7800000000000p+0, 0x1.49902f149902fp-9),
+    (double2)(0x1.7700000000000p+0, 0x1.21023358c1a68p-9),
+    (double2)(0x1.7600000000000p+0, 0x1.f7390d2a6c406p-10),
+    (double2)(0x1.7500000000000p+0, 0x1.b2b0805d5b2b1p-10),
+    (double2)(0x1.7400000000000p+0, 0x1.745d1745d1746p-10),
+    (double2)(0x1.7300000000000p+0, 0x1.3c31507fa32c4p-10),
+    (double2)(0x1.7200000000000p+0, 0x1.0a1fd1b7af017p-10),
+    (double2)(0x1.7100000000000p+0, 0x1.bc36ce3e0453ap-11),
+    (double2)(0x1.7000000000000p+0, 0x1.702e05c0b8170p-11),
+    (double2)(0x1.6f00000000000p+0, 0x1.300b79300b793p-11),
+    (double2)(0x1.6e00000000000p+0, 0x1.f76b4337c6cb1p-12),
+    (double2)(0x1.6d00000000000p+0, 0x1.a62681c860fb0p-12),
+    (double2)(0x1.6c00000000000p+0, 0x1.6c16c16c16c17p-12),
+    (double2)(0x1.6b00000000000p+0, 0x1.490aa31a3cfc7p-12),
+    (double2)(0x1.6a00000000000p+0, 0x1.3cd153729043ep-12),
+    (double2)(0x1.6900000000000p+0, 0x1.473a88d0bfd2ep-12),
+    (double2)(0x1.6800000000000p+0, 0x1.6816816816817p-12),
+    (double2)(0x1.6700000000000p+0, 0x1.9f36016719f36p-12),
+    (double2)(0x1.6600000000000p+0, 0x1.ec6a5122f9016p-12),
+    (double2)(0x1.6500000000000p+0, 0x1.27c29da5519cfp-11),
+    (double2)(0x1.6400000000000p+0, 0x1.642c8590b2164p-11),
+    (double2)(0x1.6300000000000p+0, 0x1.ab5c45606f00bp-11),
+    (double2)(0x1.6200000000000p+0, 0x1.fd3b80b11fd3cp-11),
+    (double2)(0x1.6100000000000p+0, 0x1.2cda0c6ba4eaap-10),
+    (double2)(0x1.6000000000000p+0, 0x1.6058160581606p-10),
+    (double2)(0x1.5f00000000000p+0, 0x1.990d0a4b7ef87p-10),
+    (double2)(0x1.5e00000000000p+0, 0x1.d6ee340579d6fp-10),
+    (double2)(0x1.5d00000000000p+0, 0x1.0cf87d9c54a69p-9),
+    (double2)(0x1.5c00000000000p+0, 0x1.310572620ae4cp-9),
+    (double2)(0x1.5b00000000000p+0, 0x1.5798c8ff522a2p-9),
+    (double2)(0x1.5a00000000000p+0, 0x1.80ad602b580adp-9),
+    (double2)(0x1.5900000000000p+0, 0x1.ac3e24799546fp-9),
+    (double2)(0x1.5800000000000p+0, 0x1.da46102b1da46p-9),
+    (double2)(0x1.5800000000000p+0, 0x1.5805601580560p-14),
+    (double2)(0x1.5700000000000p+0, 0x1.ed3c506b39a23p-12),
+    (double2)(0x1.5600000000000p+0, 0x1.cbdd3e2970f60p-11),
+    (double2)(0x1.5500000000000p+0, 0x1.5555555555555p-10),
+    (double2)(0x1.5400000000000p+0, 0x1.c979aee0bf805p-10),
+    (double2)(0x1.5300000000000p+0, 0x1.21291e81fd58ep-9),
+    (double2)(0x1.5200000000000p+0, 0x1.5fead500a9580p-9),
+    (double2)(0x1.5100000000000p+0, 0x1.a0fd5c5f02a3ap-9),
+    (double2)(0x1.5000000000000p+0, 0x1.e45c223898adcp-9),
+    (double2)(0x1.5000000000000p+0, 0x1.5015015015015p-12),
+    (double2)(0x1.4f00000000000p+0, 0x1.c7b16ea64d422p-11),
+    (double2)(0x1.4e00000000000p+0, 0x1.7829cbc14e5e1p-10),
+    (double2)(0x1.4d00000000000p+0, 0x1.0877db8589720p-9),
+    (double2)(0x1.4c00000000000p+0, 0x1.5710e4b5edceap-9),
+    (double2)(0x1.4b00000000000p+0, 0x1.a7dbb4d1fc1c8p-9),
+    (double2)(0x1.4a00000000000p+0, 0x1.fad40a57eb503p-9),
+    (double2)(0x1.4a00000000000p+0, 0x1.3fd6bb00a5140p-11),
+    (double2)(0x1.4900000000000p+0, 0x1.4e78ecb419ba9p-10),
+    (double2)(0x1.4800000000000p+0, 0x1.00a44029100a4p-9),
+    (double2)(0x1.4700000000000p+0, 0x1.5c28f5c28f5c3p-9),
+    (double2)(0x1.4600000000000p+0, 0x1.b9c68b2c0cc4ap-9),
+    (double2)(0x1.4600000000000p+0, 0x1.978feb9f34381p-13),
+    (double2)(0x1.4500000000000p+0, 0x1.ecf163bb6500ap-11),
+    (double2)(0x1.4400000000000p+0, 0x1.be1958b67ebb9p-10),
+    (double2)(0x1.4300000000000p+0, 0x1.44e6157dc9a3bp-9),
+    (double2)(0x1.4200000000000p+0, 0x1.acc4baa3f0ddfp-9),
+    (double2)(0x1.4200000000000p+0, 0x1.6a4cbcb2a247bp-13),
+    (double2)(0x1.4100000000000p+0, 0x1.0505050505050p-10),
+    (double2)(0x1.4000000000000p+0, 0x1.e0b4439959819p-10),
+    (double2)(0x1.3f00000000000p+0, 0x1.6027f6027f602p-9),
+    (double2)(0x1.3e00000000000p+0, 0x1.d1e854b5e0db4p-9),
+    (double2)(0x1.3e00000000000p+0, 0x1.165e7254813e2p-11),
+    (double2)(0x1.3d00000000000p+0, 0x1.76646a9d716efp-10),
+    (double2)(0x1.3c00000000000p+0, 0x1.32b48f757ce88p-9),
+    (double2)(0x1.3b00000000000p+0, 0x1.ac1b24652a906p-9),
+    (double2)(0x1.3b00000000000p+0, 0x1.3b13b13b13b14p-12),
+    (double2)(0x1.3a00000000000p+0, 0x1.490e1eb208984p-10),
+    (double2)(0x1.3900000000000p+0, 0x1.2385830fec66ep-9),
+    (double2)(0x1.3800000000000p+0, 0x1.a45a6cc111b7ep-9),
+    (double2)(0x1.3800000000000p+0, 0x1.3813813813814p-12),
+    (double2)(0x1.3700000000000p+0, 0x1.56f472517b708p-10),
+    (double2)(0x1.3600000000000p+0, 0x1.31be7bc0e8f2ap-9),
+    (double2)(0x1.3500000000000p+0, 0x1.b9cbf3e55f044p-9),
+    (double2)(0x1.3500000000000p+0, 0x1.0e7d95bc609a9p-11),
+    (double2)(0x1.3400000000000p+0, 0x1.9e6b3804d19e7p-10),
+    (double2)(0x1.3300000000000p+0, 0x1.5c8b6af7963c2p-9),
+    (double2)(0x1.3200000000000p+0, 0x1.eb9dad43bf402p-9),
+    (double2)(0x1.3200000000000p+0, 0x1.f1a515885fb37p-11),
+    (double2)(0x1.3100000000000p+0, 0x1.0eeb1d3d76c02p-9),
+    (double2)(0x1.3000000000000p+0, 0x1.a320261a32026p-9),
+    (double2)(0x1.3000000000000p+0, 0x1.c82ac40260390p-12),
+    (double2)(0x1.2f00000000000p+0, 0x1.a12f684bda12fp-10),
+    (double2)(0x1.2e00000000000p+0, 0x1.69d43fda2962cp-9),
+    (double2)(0x1.2e00000000000p+0, 0x1.2e025c04b8097p-15),
+    (double2)(0x1.2d00000000000p+0, 0x1.42804b542804bp-10),
+    (double2)(0x1.2c00000000000p+0, 0x1.3f69b02593f6ap-9),
+    (double2)(0x1.2b00000000000p+0, 0x1.df31cb46e21fap-9),
+    (double2)(0x1.2b00000000000p+0, 0x1.012b404ad012bp-10),
+    (double2)(0x1.2a00000000000p+0, 0x1.23925e7820a7fp-9),
+    (double2)(0x1.2900000000000p+0, 0x1.c8253c8253c82p-9),
+    (double2)(0x1.2900000000000p+0, 0x1.b92ddc02526e5p-11),
+    (double2)(0x1.2800000000000p+0, 0x1.1602511602511p-9),
+    (double2)(0x1.2700000000000p+0, 0x1.bf471439c9adfp-9),
+    (double2)(0x1.2700000000000p+0, 0x1.a85c40939a85cp-11),
+    (double2)(0x1.2600000000000p+0, 0x1.166f9ac024d16p-9),
+    (double2)(0x1.2500000000000p+0, 0x1.c44e10125e227p-9),
+    (double2)(0x1.2500000000000p+0, 0x1.cebf48bbd90e5p-11),
+    (double2)(0x1.2400000000000p+0, 0x1.2492492492492p-9),
+    (double2)(0x1.2300000000000p+0, 0x1.d6f2e2ec0b673p-9),
+    (double2)(0x1.2300000000000p+0, 0x1.159e26af37c05p-10),
+    (double2)(0x1.2200000000000p+0, 0x1.4024540245402p-9),
+    (double2)(0x1.2100000000000p+0, 0x1.f6f0243f6f024p-9),
+    (double2)(0x1.2100000000000p+0, 0x1.5e60121579805p-10),
+    (double2)(0x1.2000000000000p+0, 0x1.68e18cf81b10fp-9),
+    (double2)(0x1.2000000000000p+0, 0x1.2012012012012p-12),
+    (double2)(0x1.1f00000000000p+0, 0x1.c11f7047dc11fp-10),
+    (double2)(0x1.1e00000000000p+0, 0x1.9e878ff70985ep-9),
+    (double2)(0x1.1e00000000000p+0, 0x1.779d9fdc3a219p-11),
+    (double2)(0x1.1d00000000000p+0, 0x1.1eace5c957907p-9),
+    (double2)(0x1.1c00000000000p+0, 0x1.e0d5b450239e1p-9),
+    (double2)(0x1.1c00000000000p+0, 0x1.48bf073816367p-10),
+    (double2)(0x1.1b00000000000p+0, 0x1.694808dda5202p-9),
+    (double2)(0x1.1b00000000000p+0, 0x1.7c67f2bae2b21p-12),
+    (double2)(0x1.1a00000000000p+0, 0x1.ee58469ee5847p-10),
+    (double2)(0x1.1900000000000p+0, 0x1.c0233c0233c02p-9),
+    (double2)(0x1.1900000000000p+0, 0x1.14e02328a7012p-10),
+    (double2)(0x1.1800000000000p+0, 0x1.561072057b573p-9),
+    (double2)(0x1.1800000000000p+0, 0x1.1811811811812p-12),
+    (double2)(0x1.1700000000000p+0, 0x1.e28646f5a1060p-10),
+    (double2)(0x1.1600000000000p+0, 0x1.c0d1284e6f1d7p-9),
+    (double2)(0x1.1600000000000p+0, 0x1.23543f0c80459p-10),
+    (double2)(0x1.1500000000000p+0, 0x1.63cbeea4e1a09p-9),
+    (double2)(0x1.1500000000000p+0, 0x1.b9a3fdd5c8cb8p-12),
+    (double2)(0x1.1400000000000p+0, 0x1.0be1c159a76d2p-9),
+    (double2)(0x1.1300000000000p+0, 0x1.e1d1a688e4838p-9),
+    (double2)(0x1.1300000000000p+0, 0x1.72044d72044d7p-10),
+    (double2)(0x1.1200000000000p+0, 0x1.91713db81577bp-9),
+    (double2)(0x1.1200000000000p+0, 0x1.ac73ae9819b50p-11),
+    (double2)(0x1.1100000000000p+0, 0x1.460334e904cf6p-9),
+    (double2)(0x1.1100000000000p+0, 0x1.1111111111111p-12),
+    (double2)(0x1.1000000000000p+0, 0x1.feef80441fef0p-10),
+    (double2)(0x1.0f00000000000p+0, 0x1.de021fde021fep-9),
+    (double2)(0x1.0f00000000000p+0, 0x1.7b7eacc9686a0p-10),
+    (double2)(0x1.0e00000000000p+0, 0x1.9ead7cd391fbcp-9),
+    (double2)(0x1.0e00000000000p+0, 0x1.0195609804390p-10),
+    (double2)(0x1.0d00000000000p+0, 0x1.641511e8d2b32p-9),
+    (double2)(0x1.0d00000000000p+0, 0x1.222b1acf1ce96p-11),
+    (double2)(0x1.0c00000000000p+0, 0x1.2e29f79b47582p-9),
+    (double2)(0x1.0c00000000000p+0, 0x1.4f0d1682e11cdp-13),
+    (double2)(0x1.0b00000000000p+0, 0x1.f9bb096771e4dp-10),
+    (double2)(0x1.0a00000000000p+0, 0x1.e5ee45dd96ae2p-9),
+    (double2)(0x1.0a00000000000p+0, 0x1.a0429a0429a04p-10),
+    (double2)(0x1.0900000000000p+0, 0x1.bb74d5f06c021p-9),
+    (double2)(0x1.0900000000000p+0, 0x1.4fce404254fcep-10),
+    (double2)(0x1.0800000000000p+0, 0x1.95766eacbc402p-9),
+    (double2)(0x1.0800000000000p+0, 0x1.0842108421084p-10),
+    (double2)(0x1.0700000000000p+0, 0x1.73e5371d5c338p-9),
+    (double2)(0x1.0700000000000p+0, 0x1.930523fbe3368p-11),
+    (double2)(0x1.0600000000000p+0, 0x1.56b38f225f6c4p-9),
+    (double2)(0x1.0600000000000p+0, 0x1.26e978d4fdf3bp-11),
+    (double2)(0x1.0500000000000p+0, 0x1.3dd40e4eb0cc6p-9),
+    (double2)(0x1.0500000000000p+0, 0x1.97f7d73404146p-12),
+    (double2)(0x1.0400000000000p+0, 0x1.293982cc98af1p-9),
+    (double2)(0x1.0400000000000p+0, 0x1.0410410410410p-12),
+    (double2)(0x1.0300000000000p+0, 0x1.18d6f048ff7e4p-9),
+    (double2)(0x1.0300000000000p+0, 0x1.236a3ebc349dep-13),
+    (double2)(0x1.0200000000000p+0, 0x1.0c9f8ee53d18cp-9),
+    (double2)(0x1.0200000000000p+0, 0x1.0204081020408p-14),
+    (double2)(0x1.0100000000000p+0, 0x1.0486ca2f46ea6p-9),
+    (double2)(0x1.0100000000000p+0, 0x1.0101010101010p-16),
+    (double2)(0x1.0000000000000p+0, 0x1.0080402010080p-9),
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+)
+
diff --git a/amd-builtins/math64/pownD.cl b/amd-builtins/math64/pownD.cl
new file mode 100644
index 0000000..83c2762
--- /dev/null
+++ b/amd-builtins/math64/pownD.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_POWN
+#include "powD_base.h"
+
diff --git a/amd-builtins/math64/powrD.cl b/amd-builtins/math64/powrD.cl
new file mode 100644
index 0000000..a02e929
--- /dev/null
+++ b/amd-builtins/math64/powrD.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_POWR
+#include "powD_base.h"
+
diff --git a/amd-builtins/math64/remainderD.cl b/amd-builtins/math64/remainderD.cl
new file mode 100644
index 0000000..a15873c
--- /dev/null
+++ b/amd-builtins/math64/remainderD.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+#define COMPILING_REMAINDER
+#include "remainderD.h"
+
diff --git a/amd-builtins/math64/remainderD.h b/amd-builtins/math64/remainderD.h
new file mode 100644
index 0000000..95cea6f
--- /dev/null
+++ b/amd-builtins/math64/remainderD.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+static inline double
+my_ldexp(double x, int n)
+{
+    // XXX Have to go twice here because the hardware can't handle the full range (yet)
+    int nh = n >> 1;
+    return ldexp(ldexp(x, nh), n-nh);
+}
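+
+// Illustrative note (not in the original source): splitting n into two steps
+// keeps each exponent adjustment within the range the hardware ldexp is
+// assumed to handle, and costs no accuracy, since scaling by a power of two
+// is exact whenever the intermediate stays in range (as it does for the use
+// below).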
+
+#if defined(COMPILING_FMOD)
+__attribute__((overloadable, always_inline, weak)) double
+fmod(double x, double y)
+#elif defined(COMPILING_REMQUO)
+__attribute__((overloadable, always_inline, weak)) double
+remquo(double x, double y, int *pquo)
+#else
+__attribute__((overloadable, always_inline, weak)) double
+remainder(double x, double y)
+#endif
+{
+    ulong ux = as_ulong(x);
+    ulong ax = ux & ~SIGNBIT_DP64;
+    ulong xsgn = ux ^ ax;
+    double dx = as_double(ax);
+    int xexp = convert_int(ax >> EXPSHIFTBITS_DP64);
+    int xexp1 = 11 - (int) clz(ax & MANTBITS_DP64);
+    xexp1 = xexp < 1 ? xexp1 : xexp;
+
+    ulong uy = as_ulong(y);
+    ulong ay = uy & ~SIGNBIT_DP64;
+    double dy = as_double(ay);
+    int yexp = convert_int(ay >> EXPSHIFTBITS_DP64);
+    int yexp1 = 11 - (int) clz(ay & MANTBITS_DP64);
+    yexp1 = yexp < 1 ? yexp1 : yexp;
+
+#if !defined COMPILING_FMOD
+    int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
+#endif
+
+    // First assume |x| > |y|
+
+    // Set ntimes to the number of times we need to do a
+    // partial remainder. If the exponent of x is an exact multiple
+    // of 53 larger than the exponent of y, and the mantissa of x is
+    // less than the mantissa of y, ntimes will be one too large
+    // but it doesn't matter - it just means that we'll go round
+    // the loop below one extra time.
+    int ntimes = max(0, (xexp1 - yexp1) / 53);
+    double w =  my_ldexp(dy, ntimes * 53);
+    w = ntimes == 0 ? dy : w;
+    double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
+
+    // Each time round the loop we compute a partial remainder.
+    // This is done by subtracting a large multiple of w
+    // from x each time, where w is a scaled up version of y.
+    // The subtraction must be performed exactly in quad
+    // precision, though the result at each stage can
+    // fit exactly in a double precision number.
+    int i;
+    double t, v, p, pp;
+
+    for (i = 0; i < ntimes; i++) {
+        // Compute integral multiplier
+        t = trunc(dx / w);
+
+        // Compute w * t in quad precision
+        p = w * t;
+        pp = fma(w, t, -p);
+
+        // Subtract w * t from dx
+        v = dx - p;
+        dx = v + (((dx - v) - p) - pp);
+
+        // If t was one too large, dx will be negative. Add back one w.
+        dx += dx < 0.0 ? w : 0.0;
+
+        // Scale w down by 2^(-53) for the next iteration
+        w *= scale;
+    }
+
+    // One more time
+    // Variable todd says whether the integer t is odd or not
+    t = floor(dx / w);
+    long lt = (long)t;
+    int todd = lt & 1;
+
+    p = w * t;
+    pp = fma(w, t, -p);
+    v = dx - p;
+    dx = v + (((dx - v) - p) - pp);
+    i = dx < 0.0;
+    todd ^= i;
+    dx += i ? w : 0.0;
+
+#if defined(COMPILING_REMQUO)
+    lt -= i;
+#endif
+
+    // At this point, dx lies in the range [0,dy)
+
+#if !defined(COMPILING_FMOD)
+    // For the fmod function, we're done apart from setting the correct sign.
+    //
+    // For the remainder function, we need to adjust dx
+    // so that it lies in the range (-y/2, y/2] by carefully
+    // subtracting w (== dy == y) if necessary. The rigmarole
+    // with todd is to get the correct sign of the result
+    // when x/y lies exactly half way between two integers,
+    // when we need to choose the even integer.
+
+    int al = (2.0*dx > w) | (todd & (2.0*dx == w));
+    double dxl = dx - (al ? w : 0.0);
+
+    int ag = (dx > 0.5*w) | (todd & (dx == 0.5*w));
+    double dxg = dx - (ag ? w : 0.0);
+
+    dx = dy < 0x1.0p+1022 ? dxl : dxg;
+# if defined COMPILING_REMQUO
+    lt += dy < 0x1.0p+1022 ? al : ag;
+    int quo = ((int)lt & 0x7f) * qsgn;
+# endif
+#endif
+
+    double ret = as_double(xsgn ^ as_ulong(dx));
+    dx = as_double(ax);
+
+    // Now handle |x| == |y|
+    int c = dx == dy;
+    t = as_double(xsgn);
+#if defined COMPILING_REMQUO
+    quo = c ? qsgn : quo;
+#endif
+    ret = c ? t : ret;
+
+    // Next, handle |x| < |y|
+    c = dx < dy;
+#if defined COMPILING_REMQUO
+    quo = c ? 0 : quo;
+#endif
+    ret = c ? x : ret;
+
+#if !defined COMPILING_FMOD
+    c &= ((yexp < 1023) & (2.0*dx > dy)) | (dx > 0.5*dy);
+# if defined COMPILING_REMQUO
+    quo = c ? qsgn : quo;
+# endif
+    // we could use a conversion here instead since qsgn = +-1
+    p = qsgn == 1 ? -1.0 : 1.0;
+    t = fma(y, p, x);
+    ret = c ? t : ret;
+#endif
+
+    // We don't need anything special for |x| == 0
+
+    // |y| is 0
+    c = dy == 0.0;
+#if defined COMPILING_REMQUO
+    quo = c ? 0 : quo;
+#endif
+    ret = c ? as_double(QNANBITPATT_DP64) : ret;
+
+    // y is +-Inf, NaN
+    c = yexp > BIASEDEMAX_DP64;
+#if defined COMPILING_REMQUO
+    quo = c ? 0 : quo;
+#endif
+    t = y == y ? x : y;
+    ret = c ? t : ret;
+
+    // x is +-Inf, NaN
+    c = xexp > BIASEDEMAX_DP64;
+#if defined COMPILING_REMQUO
+    quo = c ? 0 : quo;
+#endif
+    ret = c ? as_double(QNANBITPATT_DP64) : ret;
+
+#if defined COMPILING_REMQUO
+    *pquo = quo;
+#endif
+    return ret;
+}
+
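+// ---------------------------------------------------------------------------
+// Illustrative sketch (not part of the AMD source): the exact "subtract w*t
+// from dx" step used twice above combines an exact product (via fma) with a
+// compensated subtraction.  A standalone version of just that building block,
+// assuming only core OpenCL double and fma support, might read:
+#if 0
+static double exact_sub_prod(double dx, double w, double t)
+{
+    double p  = w * t;          // head of the product
+    double pp = fma(w, t, -p);  // tail: the rounding error of w*t, exact
+    double v  = dx - p;         // leading difference
+    // Recover the low-order bits lost in dx - p, then fold in the tail.
+    return v + (((dx - v) - p) - pp);
+}
+#endif
+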
diff --git a/amd-builtins/math64/remainderD_piby2.h b/amd-builtins/math64/remainderD_piby2.h
new file mode 100644
index 0000000..ccf7937
--- /dev/null
+++ b/amd-builtins/math64/remainderD_piby2.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Reduction for medium sized arguments
+static inline void
+remainder_piby2_medium(double x, double *r, double *rr, int *regn)
+{
+    // How many pi/2 is x a multiple of?
+    const double two_by_pi = 0x1.45f306dc9c883p-1;
+    double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
+
+    const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
+    const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
+    const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+    // Compute the product of dnpi2 with 159 bits of pi/2
+    double p_hh = piby2_h * dnpi2;
+    double p_ht = fma(piby2_h, dnpi2, -p_hh);
+    double p_mh = piby2_m * dnpi2;
+    double p_mt = fma(piby2_m, dnpi2, -p_mh);
+    double p_th = piby2_t * dnpi2;
+    double p_tt = fma(piby2_t, dnpi2, -p_th);
+
+    // Reduce to 159 bits
+    double ph = p_hh;
+    double pm = p_ht + p_mh;
+    double t = p_mh - (pm - p_ht);
+    double pt = p_th + t + p_mt + p_tt;
+    t = ph + pm; pm = pm - (t - ph); ph = t;
+    t = pm + pt; pt = pt - (t - pm); pm = t;
+
+    // Subtract from x
+    t = x + ph;
+    double qh = t + pm;
+    double qt = pm - (qh - t) + pt;
+
+    *r = qh;
+    *rr = qt;
+    *regn = (int)(long)dnpi2 & 0x3;
+}
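+
+// Illustrative aside (not in the source): for x >= 0 the expression
+// trunc(fma(x, two_by_pi, 0.5)) above is just round-to-nearest, i.e. a
+// hypothetical unfused equivalent would be
+#if 0
+    double dnpi2_ref = floor(x * two_by_pi + 0.5);
+#endif
+// the fma form merely keeps the product correctly rounded before the add.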
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and return the result in r, rr.
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+
+// For bytealign
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
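+
+// Illustrative note (an assumption about cl_amd_media_ops, not stated in the
+// source): for the uses below, amd_bytealign(hi, lo, b) selects 32 bits at a
+// byte offset from the 64-bit concatenation hi:lo, roughly
+#if 0
+static uint bytealign_ref(uint hi, uint lo, uint b)
+{
+    return (uint)(((((ulong)hi) << 32) | lo) >> ((b & 3) * 8));
+}
+#endif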
+
+static inline void
+remainder_piby2_large(double x, double *r, double *rr, int *regn)
+{
+    USE_TABLE(uchar, pibits, PIBITS);
+
+    long ux = as_long(x);
+    int e = (int)(ux >> 52) -  1023;
+    int i = max(23, (e >> 3) + 17);
+    int j = 150 - i;
+    int j16 = j & ~0xf;
+
+    // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
+    uint4 q0 = *(__constant uint4 *)(pibits + j16);
+    uint4 q1 = *(__constant uint4 *)(pibits + j16 + 16);
+    uint4 q2 = *(__constant uint4 *)(pibits + j16 + 32);
+
+    int k = (j >> 2) & 0x3;
+    int4 c = (int4)k == (int4)(0, 1, 2, 3);
+
+    uint u0, u1, u2, u3, u4, u5, u6;
+
+    u0 = c.s1 ? q0.s1 : q0.s0;
+    u0 = c.s2 ? q0.s2 : u0;
+    u0 = c.s3 ? q0.s3 : u0;
+
+    u1 = c.s1 ? q0.s2 : q0.s1;
+    u1 = c.s2 ? q0.s3 : u1;
+    u1 = c.s3 ? q1.s0 : u1;
+
+    u2 = c.s1 ? q0.s3 : q0.s2;
+    u2 = c.s2 ? q1.s0 : u2;
+    u2 = c.s3 ? q1.s1 : u2;
+
+    u3 = c.s1 ? q1.s0 : q0.s3;
+    u3 = c.s2 ? q1.s1 : u3;
+    u3 = c.s3 ? q1.s2 : u3;
+
+    u4 = c.s1 ? q1.s1 : q1.s0;
+    u4 = c.s2 ? q1.s2 : u4;
+    u4 = c.s3 ? q1.s3 : u4;
+
+    u5 = c.s1 ? q1.s2 : q1.s1;
+    u5 = c.s2 ? q1.s3 : u5;
+    u5 = c.s3 ? q2.s0 : u5;
+
+    u6 = c.s1 ? q1.s3 : q1.s2;
+    u6 = c.s2 ? q2.s0 : u6;
+    u6 = c.s3 ? q2.s1 : u6;
+
+    uint v0 = amd_bytealign(u1, u0, j);
+    uint v1 = amd_bytealign(u2, u1, j);
+    uint v2 = amd_bytealign(u3, u2, j);
+    uint v3 = amd_bytealign(u4, u3, j);
+    uint v4 = amd_bytealign(u5, u4, j);
+    uint v5 = amd_bytealign(u6, u5, j);
+
+    // Place those 192 bits in 4 48-bit doubles along with correct exponent
+    // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
+    i = 2 + 8*i;
+    x *= i > 1018 ? 0x1.0p-136 : 1.0;
+    i -= i > 1018 ? 136 : 0;
+
+    uint ua = (uint)(1023 + 52 - i) << 20;
+    double a = as_double((uint2)(0, ua));
+    double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+    // Exact multiply
+    double f0h = p0 * x;
+    double f0l = fma(p0, x, -f0h);
+    double f1h = p1 * x;
+    double f1l = fma(p1, x, -f1h);
+    double f2h = p2 * x;
+    double f2l = fma(p2, x, -f2h);
+    double f3h = p3 * x;
+    double f3l = fma(p3, x, -f3h);
+
+    // Accumulate product into 4 doubles
+    double s, t;
+
+    double f3 = f3h + f2h;
+    t = f2h - (f3 - f3h);
+    s = f3l + t;
+    t = t - (s - f3l);
+
+    double f2 = s + f1h;
+    t = f1h - (f2 - s) + t;
+    s = f2l + t;
+    t = t - (s - f2l);
+
+    double f1 = s + f0h;
+    t = f0h - (f1 - s) + t;
+    s = f1l + t;
+
+    double f0 = s + f0l;
+
+    // Strip off unwanted large integer bits
+    f3 = 0x1.0p+10 * __amdil_fraction_f64(f3 * 0x1.0p-10);
+    f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+#undef EXTRA_ACCURACY
+#if defined EXTRA_ACCURACY
+    // Shift out large integer bits.  This adds about 20 bits to the accuracy of "rr"
+    s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+    s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+    s = f1 + f0; t = f0 - (s - f1); f1 = s; f0 = t;
+#endif
+
+    // Compute least significant integer bits
+    t = f3 + f2;
+    double di = t - __amdil_fraction_f64(t);
+    i = (float)di;
+
+    // Shift out remaining integer part
+    f3 -= di;
+    s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+    s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+    f1 += f0;
+
+    // Subtract 1 if fraction is >= 0.5, and update regn
+    int g = f3 >= 0.5;
+    i += g;
+    f3 -= (float)g;
+
+    // Shift up bits
+    s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
+
+    // Multiply precise fraction by pi/2 to get radians
+    const double p2h = 7074237752028440.0 / 0x1.0p+52;
+    const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+    double rhi = f3 * p2h;
+    double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+    *r = rhi + rlo;
+    *rr = rlo - (*r - rhi);
+    *regn = i & 0x3;
+}
+
diff --git a/amd-builtins/math64/remquoD.cl b/amd-builtins/math64/remquoD.cl
new file mode 100644
index 0000000..f296549
--- /dev/null
+++ b/amd-builtins/math64/remquoD.cl
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+#define COMPILING_REMQUO
+#include "remainderD.h"
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline)) double
+remquo(double x, double y, __global int *quo)
+{
+    int q;
+    double r = remquo(x, y, &q);
+    *quo = q;
+    return r;
+}
+
+__attribute__((overloadable, always_inline)) double
+remquo(double x, double y, __local int *quo)
+{
+    int q;
+    double r = remquo(x, y, &q);
+    *quo = q;
+    return r;
+}
+#endif
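+
+// Illustrative usage sketch (not part of the source): the wrappers above let
+// pre-2.0 kernels pass non-private pointers directly, e.g.
+#if 0
+__kernel void remquo_demo(__global double *r, __global int *q,
+                          double x, double y)
+{
+    *r = remquo(x, y, q);   // resolves to the __global overload above
+}
+#endif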
diff --git a/amd-builtins/math64/rintD.cl b/amd-builtins/math64/rintD.cl
new file mode 100644
index 0000000..00e3ef6
--- /dev/null
+++ b/amd-builtins/math64/rintD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__ ((overloadable, always_inline)) double
+rint(double x)
+{
+    return __amdil_round_nearest_f64(x);
+}
+
diff --git a/amd-builtins/math64/rootnD.cl b/amd-builtins/math64/rootnD.cl
new file mode 100644
index 0000000..06840dc
--- /dev/null
+++ b/amd-builtins/math64/rootnD.cl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define COMPILING_ROOTN
+#include "powD_base.h"
+
diff --git a/amd-builtins/math64/roundD.cl b/amd-builtins/math64/roundD.cl
new file mode 100644
index 0000000..2aa0827
--- /dev/null
+++ b/amd-builtins/math64/roundD.cl
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__ ((overloadable, always_inline)) double
+round(double x)
+{
+    long l = as_long(x);
+    int e = ((int)(l >> 52) & 0x7ff) - 1023;   // unbiased exponent
+    // Result for |x| < 1: keep the sign; 0.5 <= |x| < 1 rounds to +-1
+    long s = (l & 0x8000000000000000L) | (e == -1 ? 0x3ff0000000000000L : 0L);
+    long m = 0x000fffffffffffffL >> e;         // fraction bits below the units place
+    long d = 0x0008000000000000L >> e;         // the value 0.5 at this exponent
+    long k = l + (l & m ? d : 0);              // add 0.5: rounds half away from zero
+    k &= ~m;                                   // truncate the fraction
+    k = e < 0 ? s : k;                         // |x| < 1 handled via s
+    k = e > 51 ? l : k;                        // already integral, or Inf/NaN
+    return as_double(k);
+}
+
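+// Worked cases (illustrative, not in the source): the bit manipulation above
+// implements round-half-away-from-zero, e.g.
+//   round( 2.5) ==  3.0    (fraction bits set, d added, then truncated)
+//   round(-2.5) == -3.0    (sign bit untouched throughout)
+//   round( 0.5) ==  1.0    (e == -1 path in s)
+//   round( 0.4) ==  0.0    (e < -1 path, signed zero preserved)
+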
diff --git a/amd-builtins/math64/rsqrtD.cl b/amd-builtins/math64/rsqrtD.cl
new file mode 100644
index 0000000..3f6f346
--- /dev/null
+++ b/amd-builtins/math64/rsqrtD.cl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable, always_inline)) double
+rsqrt(double x)
+{
+    double y0 = __amdil_rsq_f64(x);              // hardware estimate
+    double y1 = 0.5 * y0 * fma(-x*y0, y0, 3.0);  // Newton-Raphson refinement
+    double y2 = 0.5 * y1 * fma(-x*y1, y1, 3.0);  // second refinement
+    return y0 > 0.0 & y0 <= 0x1.fffffffffffffp+1023 ? y2 : y0; // 0, Inf, NaN pass through
+}
+
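+// Illustrative derivation (not in the source): each refinement above is one
+// Newton-Raphson step for f(y) = 1/y^2 - x, i.e. y' = y*(3 - x*y*y)/2, which
+// roughly doubles the number of correct bits per step.  In isolation:
+#if 0
+static double nr_rsqrt_step(double x, double y)
+{
+    return 0.5 * y * fma(-x * y, y, 3.0);   // y * (3 - x*y*y) / 2
+}
+#endif
+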
diff --git a/amd-builtins/math64/sinD.cl b/amd-builtins/math64/sinD.cl
new file mode 100644
index 0000000..e4705de
--- /dev/null
+++ b/amd-builtins/math64/sinD.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "sincosD_piby4.h"
+#include "remainderD_piby2.h"
+
+__attribute__((overloadable, always_inline, pure, weak)) double
+sin(double x)
+{
+    double y = fabs(x);
+
+    double r, rr;
+    int regn;
+
+    if (y < 0x1.0p+47)
+        remainder_piby2_medium(y, &r, &rr, &regn);
+    else 
+        remainder_piby2_large(y, &r, &rr, &regn);
+
+    double2 sc = sincos_piby4(r, rr);
+
+    int2 s = as_int2(regn & 1 ? sc.hi : sc.lo);
+    s.hi ^= ((regn > 1) << 31) ^ ((x < 0.0) << 31);
+
+    return isinf(x) | isnan(x) ? as_double(QNANBITPATT_DP64) : as_double(s);
+}
+
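+// Illustrative reference (not in the source; uses the names from the body
+// above): with r in [-pi/4, pi/4] and regn the quadrant count mod 4, the
+// packed selection is equivalent to
+#if 0
+// assuming sc = (double2)(sin(r), cos(r)) from sincos_piby4:
+double sin_ref = (regn & 1) ? sc.hi : sc.lo;   // odd quadrants take cos(r)
+sin_ref = (regn > 1) ? -sin_ref : sin_ref;     // quadrants 2,3 flip the sign
+sin_ref = (x < 0.0) ? -sin_ref : sin_ref;      // sin is odd
+#endif
+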
diff --git a/amd-builtins/math64/sincosD.cl b/amd-builtins/math64/sincosD.cl
new file mode 100644
index 0000000..daf4102
--- /dev/null
+++ b/amd-builtins/math64/sincosD.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "sincosD_piby4.h"
+#include "remainderD_piby2.h"
+
+__attribute__((overloadable, always_inline)) double
+sincos(double x, double * cp)
+{
+    double y = fabs(x);
+
+    double r, rr;
+    int regn;
+
+    if (y < 0x1.0p+47)
+        remainder_piby2_medium(y, &r, &rr, &regn);
+    else 
+        remainder_piby2_large(y, &r, &rr, &regn);
+
+    double2 sc = sincos_piby4(r, rr);
+
+    int flip = (regn > 1) << 31;
+    int2 s = as_int2(regn & 1 ?  sc.hi : sc.lo);
+    s.hi ^= flip ^ ((x < 0.0) << 31);
+    sc.lo = -sc.lo;
+    int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
+    c.hi ^= flip;
+
+    int xgeinf = isnan(x) | isinf(x);
+    s = xgeinf ? as_int2(QNANBITPATT_DP64) : s;
+    c = xgeinf ? as_int2(QNANBITPATT_DP64) : c;
+
+    *cp = as_double(c);
+    return as_double(s);
+}
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline)) double
+sincos(double x, __global double * cp)
+{
+    double c;
+    double s = sincos(x, &c);
+    *cp = c;
+    return s;
+}
+
+__attribute__((overloadable, always_inline)) double
+sincos(double x, __local double * cp)
+{
+    double c;
+    double s = sincos(x, &c);
+    *cp = c;
+    return s;
+}
+#endif
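+
+// Illustrative reference (not in the source; uses the names from the body
+// above): relative to sin(), the cosine lane negates sc.lo first because the
+// cosine of the full argument is -sin(r) in odd quadrants:
+#if 0
+// assuming sc = (double2)(sin(r), cos(r)) and x >= 0:
+double cos_ref = (regn & 1) ? -sc.lo : sc.hi;
+cos_ref = (regn > 1) ? -cos_ref : cos_ref;
+#endif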
diff --git a/amd-builtins/math64/sincosD_piby4.h b/amd-builtins/math64/sincosD_piby4.h
new file mode 100644
index 0000000..384a5f7
--- /dev/null
+++ b/amd-builtins/math64/sincosD_piby4.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+static inline double2
+sincos_piby4(double x, double xx)
+{
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+    //                      = x * f(w)
+    // where w = x*x and f(w) = 1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we add a correction
+    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+    // is an approximation to cos(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    //                      = f(w)
+    // where w = x*x and f(w) = 1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we subtract a correction
+    // term g(x,xx) = x*xx to the result, where g(x,xx)
+    // is an approximation to sin(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    const double sc1 = -0.166666666666666646259241729;
+    const double sc2 =  0.833333333333095043065222816e-2;
+    const double sc3 = -0.19841269836761125688538679e-3;
+    const double sc4 =  0.275573161037288022676895908448e-5;
+    const double sc5 = -0.25051132068021699772257377197e-7;
+    const double sc6 =  0.159181443044859136852668200e-9;
+
+    const double cc1 =  0.41666666666666665390037e-1;
+    const double cc2 = -0.13888888888887398280412e-2;
+    const double cc3 =  0.248015872987670414957399e-4;
+    const double cc4 = -0.275573172723441909470836e-6;
+    const double cc5 =  0.208761463822329611076335e-8;
+    const double cc6 = -0.113826398067944859590880e-10;
+
+    double x2 = x * x;
+    double x3 = x2 * x;
+    double r = 0.5 * x2;
+    double t = 1.0 - r;
+
+    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+                        x2*x2, fma(x, xx, (1.0 - t) - r));
+
+    double2 ret;
+    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
+    ret.hi = cp;
+
+    return ret;
+}
+
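+// Illustrative expansion (not in the source; uses the names from the body
+// above): ignoring rounding, ret.lo unfolds to the form the comments describe,
+#if 0
+double sin_approx = x + x3 * (sc1 + x2 * sp)   // x * f(w) with w = x*x
+                  + xx * (1.0 - 0.5 * x2);     // correction g(x,xx) ~ cos(x)*sin(xx)
+#endif
+// the fma nesting in ret.lo evaluates the same quantity with fewer roundings.
+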
diff --git a/amd-builtins/math64/sinhD.cl b/amd-builtins/math64/sinhD.cl
new file mode 100644
index 0000000..11098fc
--- /dev/null
+++ b/amd-builtins/math64/sinhD.cl
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+sinh(double x)
+{
+    USE_TABLE(double2, sinh_tbl, SINH_TBL);
+    USE_TABLE(double2, cosh_tbl, COSH_TBL);
+
+    // After dealing with special cases the computation is split into
+    // regions as follows:
+    //
+    // abs(x) >= max_sinh_arg:
+    // sinh(x) = sign(x)*Inf
+    //
+    // abs(x) >= small_threshold:
+    // sinh(x) = sign(x)*exp(abs(x))/2 computed using the
+    // splitexp and scaleDouble functions as for exp_amd().
+    //
+    // abs(x) < small_threshold:
+    // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
+    // sinh(x) is then sign(x)*z.
+
+    const double max_sinh_arg = 7.10475860073943977113e+02; // 0x408633ce8fb9f87e
+
+    // Threshold above which exp(-x) is insignificant compared to exp(x): ln(2^27)
+    const double small_threshold = 0x1.2b708872320e2p+4;
+
+    double y = fabs(x);
+
+    // In this range we find the integer part y0 of y
+    // and the increment dy = y - y0. We then compute
+    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are obtained from tables
+
+    int ind = min((int)y, 36);
+    double dy = y - ind;
+    double dy2 = dy * dy;
+
+    double sdy = dy * dy2 *
+	         fma(dy2,
+		     fma(dy2,
+			 fma(dy2,
+			     fma(dy2,
+				 fma(dy2,
+				     fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9),
+				     0.250521176994133472333666e-7),
+				 0.275573191913636406057211e-5),
+			     0.198412698413242405162014e-3),
+			 0.833333333333329931873097e-2),
+		     0.166666666666666667013899e0);
+
+    double cdy = dy2 * fma(dy2,
+	                   fma(dy2,
+			       fma(dy2,
+				   fma(dy2,
+				       fma(dy2,
+					   fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8),
+					   0.275573350756016588011357e-6),
+				       0.248015872460622433115785e-4),
+				   0.138888888889814854814536e-2),
+			       0.416666666666660876512776e-1),
+			   0.500000000000000005911074e0);
+
+    // At this point sinh(dy) is approximated by dy + sdy.
+    // Shift some significant bits from dy to sdy.
+    double sdy1 = as_double(as_ulong(dy) & 0xfffffffff8000000UL);
+    double sdy2 = sdy + (dy - sdy1);
+
+    double2 tv = cosh_tbl[ind];
+    double cl = tv.s0;
+    double ct = tv.s1;
+    tv = sinh_tbl[ind];
+    double sl = tv.s0;
+    double st = tv.s1;
+
+    double z = fma(cl, sdy1, fma(sl, cdy, fma(cl, sdy2, fma(ct, sdy1, fma(st, cdy, ct*sdy2)) + st))) + sl;
+
+    // Other cases
+    z = y < 0x1.0p-28 | isnan(x) | isinf(x) ? y : z;
+
+    double t = exp(y - 0x1.62e42fefa3800p-1);
+    t = fma(t, -0x1.ef35793c76641p-45, t);
+    z = y >= small_threshold ? t : z;
+    z = y >= max_sinh_arg ? as_double(PINFBITPATT_DP64) : z;
+
+    return copysign(z, x);
+}
+
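+// Illustrative note (not in the source): the large-argument path computes
+// exp(y)/2 as exp(y - ln 2), with ln 2 split into the head constant
+// 0x1.62e42fefa3800p-1 and a tail folded back in by the fma line, so the
+// result stays finite right up to max_sinh_arg.  A naive
+#if 0
+double t_ref = 0.5 * exp(y);   // overflows before exp(y - ln2) does
+#endif
+// would saturate to +Inf one binade early.
+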
diff --git a/amd-builtins/math64/sinhcoshD_table.h b/amd-builtins/math64/sinhcoshD_table.h
new file mode 100644
index 0000000..1db0e89
--- /dev/null
+++ b/amd-builtins/math64/sinhcoshD_table.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Lead and tail tabulated values of sinh(i) and cosh(i) 
+// for i = 0,...,36. The lead part has 26 leading bits.
+
+DECLARE_TABLE(double2, SINH_TBL, 37,
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.2cd9fc0000000p+0, 0x1.13ae6096a0092p-26),
+    (double2)(0x1.d03cf60000000p+1, 0x1.db70cfb79a640p-26),
+    (double2)(0x1.40926e0000000p+3, 0x1.c2526b66dc067p-23),
+    (double2)(0x1.b4a3800000000p+4, 0x1.b81b18647f380p-23),
+    (double2)(0x1.28d0160000000p+6, 0x1.bc1cdd1e1eb08p-20),
+    (double2)(0x1.936d228000000p+7, 0x1.d9f201534fb09p-19),
+    (double2)(0x1.1228768000000p+9, 0x1.d1c064a4e9954p-18),
+    (double2)(0x1.749ea50000000p+10, 0x1.4eca65d06ea74p-18),
+    (double2)(0x1.fa71570000000p+11, 0x1.0c259bcc0ecc5p-15),
+    (double2)(0x1.5829dc8000000p+13, 0x1.b5a6647cf9016p-13),
+    (double2)(0x1.d3c4488000000p+14, 0x1.9691adefb0870p-15),
+    (double2)(0x1.3de1650000000p+16, 0x1.3410fc29cde38p-10),
+    (double2)(0x1.b00b590000000p+17, 0x1.6a31a50b6fb3cp-11),
+    (double2)(0x1.259ac48000000p+19, 0x1.7defc71805c40p-10),
+    (double2)(0x1.8f0cca8000000p+20, 0x1.eb49fd80e0babp-6),
+    (double2)(0x1.0f2ebd0000000p+22, 0x1.4fffc7bcd5920p-7),
+    (double2)(0x1.7093488000000p+23, 0x1.03a93b6c63435p-3),
+    (double2)(0x1.f4f2208000000p+24, 0x1.1940bb255fd1cp-4),
+    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e14260b50p-2),
+    (double2)(0x1.ceb0888000000p+27, 0x1.b47401fc9f2a2p+0),
+    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55634f1p+3),
+    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff8194ddcp+2),
+    (double2)(0x1.226af30000000p+32, 0x1.d8fee052ba63ap+5),
+    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde3f6p+7),
+    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557d1ap+8),
+    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9dc4p+8),
+    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abap+11),
+    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
+    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6363p+14),
+    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15),
+    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
+    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
+    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
+    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
+    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
+    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22),
+)
+
+DECLARE_TABLE(double2, COSH_TBL, 37,
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.8b07550000000p+0, 0x1.d9f5504c2bd28p-28),
+    (double2)(0x1.e18fa08000000p+1, 0x1.7cb66f0a4c9fdp-25),
+    (double2)(0x1.422a490000000p+3, 0x1.f58617928e588p-23),
+    (double2)(0x1.b4ee858000000p+4, 0x1.bc7d000c38d48p-25),
+    (double2)(0x1.28d6fc8000000p+6, 0x1.f7f9d4e329998p-21),
+    (double2)(0x1.936e678000000p+7, 0x1.6e6e464885269p-19),
+    (double2)(0x1.1228948000000p+9, 0x1.ba3a8b946c154p-19),
+    (double2)(0x1.749eaa8000000p+10, 0x1.3f4e76110d5a4p-18),
+    (double2)(0x1.fa71580000000p+11, 0x1.17622515a3e2bp-15),
+    (double2)(0x1.5829dd0000000p+13, 0x1.4dc4b528af3d0p-17),
+    (double2)(0x1.d3c4488000000p+14, 0x1.1156278615e10p-14),
+    (double2)(0x1.3de1650000000p+16, 0x1.35ad50ed821f5p-10),
+    (double2)(0x1.b00b590000000p+17, 0x1.6b61055f2935cp-11),
+    (double2)(0x1.259ac48000000p+19, 0x1.7e2794a601240p-10),
+    (double2)(0x1.8f0cca8000000p+20, 0x1.eb4b45f6aadd3p-6),
+    (double2)(0x1.0f2ebd0000000p+22, 0x1.5000b967b3698p-7),
+    (double2)(0x1.7093488000000p+23, 0x1.03a940fadc092p-3),
+    (double2)(0x1.f4f2208000000p+24, 0x1.1940bf3bf874cp-4),
+    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e1a2a2110p-2),
+    (double2)(0x1.ceb0888000000p+27, 0x1.b4740205796d6p+0),
+    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55cb85dp+3),
+    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff81e18acp+2),
+    (double2)(0x1.226af30000000p+32, 0x1.d8fee052bdea4p+5),
+    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde926p+7),
+    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557e0ep+8),
+    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9e1cp+8),
+    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abep+11),
+    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
+    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6364p+14),
+    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15),
+    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
+    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
+    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
+    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
+    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
+    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22),
+)
+
diff --git a/amd-builtins/math64/sinpiD.cl b/amd-builtins/math64/sinpiD.cl
new file mode 100644
index 0000000..a935a9c
--- /dev/null
+++ b/amd-builtins/math64/sinpiD.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "sincosD_piby4.h"
+
+__attribute__((overloadable)) double
+sinpi(double x)
+{
+    const double pi = 3.1415926535897932384626433832795;
+
+    long ix = as_long(x); 
+    long xsgn = ix & 0x8000000000000000L;
+    ix ^= xsgn;
+    double ax = as_double(ix);
+    long iax = (long)ax;
+    double r = ax - (double)iax;
+    long xodd = xsgn ^ (iax & 0x1L ? 0x8000000000000000L : 0L);
+
+    // Initialize with return for +-Inf and NaN
+    long ir = 0x7ff8000000000000L;
+
+    // 2^52 <= |x| < Inf, the result is always integer
+    ir = ix < 0x7ff0000000000000L ? xsgn : ir;
+
+    // |x| < 2^52, result depends on which 0.25 interval
+
+    // r < 1.0
+    double a = 1.0 - r;
+    int e = 0;
+
+    //  r <= 0.75
+    int c = r <= 0.75;
+    double t = r - 0.5;
+    a = c ? t : a;
+    e = c ? 1 : e;
+
+    // r < 0.5
+    c = r < 0.5;
+    t = 0.5 - r;
+    a = c ? t : a;
+
+    // r <= 0.25
+    c = r <= 0.25;
+    a = c ? r : a;
+    e = c ? 0 : e;
+
+    double api = a * pi;
+    double2 sc = sincos_piby4(api, 0.0);
+    long jr = xodd ^ as_long(e ? sc.hi : sc.lo);
+
+    ir = ax < 0x1.0p+52 ? jr : ir;
+
+    return as_double(ir);
+}
+
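+// Illustrative summary (not in the source): the cascade of selects folds the
+// fractional part r into [0, 0.25] so one sincos_piby4 call suffices:
+//   r <= 0.25        -> sin(pi * r)
+//   0.25 < r <= 0.75 -> cos(pi * |r - 0.5|)   (the e == 1 branch)
+//   r >  0.75        -> sin(pi * (1 - r))
+// with the overall sign carried in xodd.
+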
diff --git a/amd-builtins/math64/sqrtD.cl b/amd-builtins/math64/sqrtD.cl
new file mode 100644
index 0000000..9d7efb5
--- /dev/null
+++ b/amd-builtins/math64/sqrtD.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+extern __attribute__((pure)) double __hsail_sqrt_f64(double);
+
+__attribute__((overloadable, always_inline, weak)) double
+sqrt(double x)
+{
+    /* Use __hsail_sqrt_f64 because our nsqrt_f64 does not have the necessary precision. */
+    return __hsail_sqrt_f64(x);
+}
diff --git a/amd-builtins/math64/tables64.cl b/amd-builtins/math64/tables64.cl
new file mode 100644
index 0000000..4cafec2
--- /dev/null
+++ b/amd-builtins/math64/tables64.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+#include "pibits64.h"
+
+#include "expD_table.h"
+
+#include "cbrtD_table.h"
+
+#include "logD_table.h"
+
+#include "powD_table.h"
+
+#include "sinhcoshD_table.h"
+
+#include "atan2D_table.h"
+
diff --git a/amd-builtins/math64/tanD.cl b/amd-builtins/math64/tanD.cl
new file mode 100644
index 0000000..59c0742
--- /dev/null
+++ b/amd-builtins/math64/tanD.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "tanD_piby4.h"
+#include "remainderD_piby2.h"
+
+__attribute__((overloadable)) double
+tan(double x)
+{
+    double y = fabs(x);
+
+    double r, rr;
+    int regn;
+
+    if (y < 0x1.0p+30)
+        remainder_piby2_medium(y, &r, &rr, &regn);
+    else 
+        remainder_piby2_large(y, &r, &rr, &regn);
+
+    double2 tt = tan_piby4(r, rr);
+
+    int2 t = as_int2(regn & 1 ? tt.y : tt.x);
+    t.hi ^= (x < 0.0) << 31;
+
+    return __amdil_class_f64(x, SNAN|QNAN|PINF|NINF) ? as_double(QNANBITPATT_DP64) : as_double(t);
+}
+
diff --git a/amd-builtins/math64/tanD_piby4.h b/amd-builtins/math64/tanD_piby4.h
new file mode 100644
index 0000000..011b165
--- /dev/null
+++ b/amd-builtins/math64/tanD_piby4.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// tan(x + xx) approximation valid on the interval [-pi/4,pi/4].
+// Also return -1/tan(x + xx) in .y
+static inline double2
+tan_piby4(double x, double xx)
+{
+    const double piby4_lead = 7.85398163397448278999e-01; // 0x3fe921fb54442d18
+    const double piby4_tail = 3.06161699786838240164e-17; // 0x3c81a62633145c06
+
+    // In order to maintain relative precision transform using the identity:
+    // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
+    // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4.
+
+    int ca = x >  0.68;
+    int cb = x < -0.68;
+    double transform = ca ?  1.0 : 0.0;
+    transform = cb ? -1.0 : transform;
+
+    double tx = fma(-transform, x, piby4_lead) + fma(-transform, xx, piby4_tail);
+    int c = ca | cb;
+    x = c ? tx : x;
+    xx = c ? 0.0 : xx;
+
+    // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
+    double t1 = x;
+    double r = fma(2.0, x*xx, x*x);
+
+    double a = fma(r,
+                   fma(r, 0.224044448537022097264602535574e-3, -0.229345080057565662883358588111e-1),
+                   0.372379159759792203640806338901e0);
+
+    double b = fma(r,
+                   fma(r,
+                       fma(r, -0.232371494088563558304549252913e-3, 0.260656620398645407524064091208e-1),
+                       -0.515658515729031149329237816945e0),
+                   0.111713747927937668539901657944e1);
+
+    double t2 = fma(MATH_DIVIDE(a, b), x*r, xx);
+
+    double tp = t1 + t2;
+
+    // Compute -1.0/(t1 + t2) accurately
+    double z1 = as_double(as_long(tp) & 0xffffffff00000000L);
+    double z2 = t2 - (z1 - t1);
+    double trec = -MATH_RECIP(tp);
+    double trec_top = as_double(as_long(trec) & 0xffffffff00000000L);
+
+    double tpr = fma(fma(trec_top, z2, fma(trec_top, z1, 1.0)), trec, trec_top);
+
+    double tpt = transform * (1.0 - MATH_DIVIDE(2.0*tp, 1.0 + tp));
+    double tptr = transform * (MATH_DIVIDE(2.0*tp, tp - 1.0) - 1.0);
+
+    double2 ret;
+    ret.lo = c ? tpt : tp;
+    ret.hi = c ? tptr : tpr;
+    return ret;
+}
+
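+// Illustrative note (not in the source; uses the names from the body above):
+// the "-1.0/(t1 + t2)" block is a one-step fma reciprocal refinement; with
+// trec ~ -1/tp and trec_top its truncation, it evaluates
+#if 0
+double tpr_ref = trec_top + trec * (1.0 + trec_top * z1 + trec_top * z2);
+#endif
+// where z1 + z2 reconstitutes tp with extra precision.
+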
diff --git a/amd-builtins/math64/tanhD.cl b/amd-builtins/math64/tanhD.cl
new file mode 100644
index 0000000..91dc1a8
--- /dev/null
+++ b/amd-builtins/math64/tanhD.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+tanh(double x)
+{
+    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+    // to the following three formulae:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // but computationally, some formulae are better on some ranges.
+
+    // The point at which e^-x becomes insignificant compared to e^x: ln(2^27)
+    const double large_threshold = 0x1.2b708872320e2p+4;
+
+    ulong ux = as_ulong(x);
+    ulong ax = ux & ~SIGNBIT_DP64;
+    ulong sx = ux ^ ax;
+    double y = as_double(ax);
+    double y2 = y * y;
+
+    // y < 0.9
+    double znl = fma(y2,
+	             fma(y2,
+			 fma(y2, -0.142077926378834722618091e-7, -0.200047621071909498730453e-3),
+			 -0.176016349003044679402273e-1),
+		     -0.274030424656179760118928e0);
+
+    double zdl = fma(y2,
+	             fma(y2,
+			 fma(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1),
+			 0.381641414288328849317962e0),
+		     0.822091273968539282568011e0);
+
+    // 0.9 <= y <= 1
+    double znm = fma(y2,
+	             fma(y2,
+			 fma(y2, -0.115475878996143396378318e-7, -0.165597043903549960486816e-3),
+			 -0.146173047288731678404066e-1),
+		     -0.227793870659088295252442e0);
+
+    double zdm = fma(y2,
+	             fma(y2,
+			 fma(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1),
+			 0.317204558977294374244770e0),
+		     0.683381611977295894959554e0);
+
+    int c = y < 0.9;
+    double zn = c ? znl : znm;
+    double zd = c ? zdl : zdm;
+    double z = y + y*y2 * MATH_DIVIDE(zn, zd);
+
+    // y > 1
+    double p = exp(2.0 * y) + 1.0;
+    double zg = 1.0 - 2.0 / p;
+
+    z = y > 1.0 ? zg : z;
+
+    // Other cases
+    z = y < 0x1.0p-28 | ax > PINFBITPATT_DP64 ? x : z;
+
+    z = y > large_threshold ? 1.0 : z;
+
+    return as_double(sx | as_ulong(z));
+}
+
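+// Illustrative summary (not in the source): the region cutovers above are
+// |x| < 0.9 and [0.9, 1] for the two rational corrections, formula 2 for
+// |x| > 1, e.g.
+#if 0
+double tanh_ref = 1.0 - 2.0 / (exp(2.0 * y) + 1.0);   // the y > 1 path
+#endif
+// and saturation to sign(x)*1.0 once y > ln(2^27), where exp(-2y) can no
+// longer affect the rounded result.
+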
diff --git a/amd-builtins/math64/tanpiD.cl b/amd-builtins/math64/tanpiD.cl
new file mode 100644
index 0000000..d2e12a2
--- /dev/null
+++ b/amd-builtins/math64/tanpiD.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+#include "tanD_piby4.h"
+
+__attribute__((overloadable)) double
+tanpi(double x)
+{
+    const double pi = 3.1415926535897932384626433832795;
+
+    long ix = as_long(x);
+    long xsgn = ix & 0x8000000000000000L;
+    long xnsgn = xsgn ^ 0x8000000000000000L;
+    ix ^= xsgn;
+    double ax = as_double(ix);
+    long iax = (long)ax;
+    double r = ax - iax;
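+    // r is the fractional part of |x|; tan(pi*x) has period 1, so the
+    // reduction below only needs r and the parity of the integer part.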
+    long xodd = xsgn ^ (iax & 0x1 ? 0x8000000000000000L : 0L);
+
+    // Initialize with return for +-Inf and NaN
+    long ir = 0x7ff8000000000000L;
+
+    // 2^53 <= |x| < Inf: x is an even integer, so the result is +-0
+    ir = ix < 0x7ff0000000000000L ? xsgn : ir;
+
+    // 2^52 <= |x| < 2^53: x is an integer; the sign of the zero result
+    // follows the parity of x
+    ir = ix < 0x4340000000000000L ? xodd : ir;
+
+    // |x| < 2^52: the result depends on which quarter interval r falls in
+
+    // 0.75 < r < 1.0 (default case)
+    double a = 1.0 - r;
+    int e = 0;
+    long s = xnsgn;
+
+    // r <= 0.75
+    int c = r <= 0.75;
+    double t = r - 0.5;
+    a = c ? t : a;
+    e = c ? 1 : e;
+    s = c ? xsgn : s;
+
+    // r < 0.5
+    c = r < 0.5;
+    t = 0.5 - r;
+    a = c ? t : a;
+    s = c ? xnsgn : s;
+
+    // r <= 0.25
+    c = r <= 0.25;
+    a = c ? r : a;
+    e = c ? 0 : e;
+    s = c ? xsgn : s;
+
+    double api = a * pi;
+    double2 tt = tan_piby4(api, 0.0);
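+    // tan_piby4 is expected to return tan in .lo and -cot in .hi (see
+    // tanD_piby4.h); e selects between them and s supplies the sign.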
+    long jr = s ^ as_long(e ? tt.hi : tt.lo);
+
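+    // r == 0.5 lands on the pole of tan at pi/2: return +-Inf.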
+    long si = xodd | 0x7ff0000000000000L;
+    jr = r == 0.5 ? si : jr;
+
+    ir = ix < 0x4330000000000000L ? jr : ir;
+
+    return as_double(ir);
+}
+
diff --git a/amd-builtins/math64/tgammaD.cl b/amd-builtins/math64/tgammaD.cl
new file mode 100644
index 0000000..020cb66
--- /dev/null
+++ b/amd-builtins/math64/tgammaD.cl
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__((overloadable)) double
+tgamma(double x)
+{
+    const double pi = 3.1415926535897932384626433832795;
+    double ax = fabs(x);
+    double lg = lgamma(ax);
+    double g = exp(lg);
+
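+    // For x < 0, use the reflection formula
+    //   gamma(x) = pi / (sin(pi*x) * ax * gamma(ax)),   ax = -x,
+    // which follows from gamma(x) * gamma(1 - x) = pi / sin(pi*x).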
+    if (x < 0.0)
+    {
+        double z = sinpi(x);
+        g = g * ax * z;
+        g = pi / g;
+        g = g == 0 ? as_double(PINFBITPATT_DP64) : g;
+        g = z == 0 ? as_double(QNANBITPATT_DP64) : g;
+    }
+
+    return g;
+}
+
diff --git a/amd-builtins/math64/truncD.cl b/amd-builtins/math64/truncD.cl
new file mode 100644
index 0000000..1be69cb
--- /dev/null
+++ b/amd-builtins/math64/truncD.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math64.h"
+
+__attribute__ ((overloadable, always_inline)) double
+trunc(double x)
+{
+    return __amdil_round_zero_f64(x);
+}
+
diff --git a/amd-builtins/math64/vexpandD.cl b/amd-builtins/math64/vexpandD.cl
new file mode 100644
index 0000000..4a797f6
--- /dev/null
+++ b/amd-builtins/math64/vexpandD.cl
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
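+// Expansions of the pointer-result builtins frexp, lgamma_r and remquo to
+// the wide vector types: each overload splits its operands into .lo/.hi
+// halves (.s01/.s2 for 3-vectors) and recurses into the narrower overload.
+// The explicit __global and __local pointer overloads are guarded with
+// #ifndef __clang__ and so are only built for the non-clang compiler.
+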
+__attribute__((overloadable, always_inline, weak)) double16
+frexp(double16 x, int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+frexp(double16 x, __global int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+frexp(double16 x, __local int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double8
+frexp(double8 x, int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+frexp(double8 x, __global int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+frexp(double8 x, __local int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double4
+frexp(double4 x, int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+frexp(double4 x, __global int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+frexp(double4 x, __local int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double3
+frexp(double3 x, int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = frexp(x.s01, &j);
+    i.s01 = j;
+    r.s2 = frexp(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+frexp(double3 x, __global int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = frexp(x.s01, &j);
+    i.s01 = j;
+    r.s2 = frexp(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+frexp(double3 x, __local int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = frexp(x.s01, &j);
+    i.s01 = j;
+    r.s2 = frexp(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double2
+frexp(double2 x, int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+frexp(double2 x, __global int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+frexp(double2 x, __local int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = frexp(x.lo, &j);
+    i.lo = j;
+    r.hi = frexp(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double16
+lgamma_r(double16 x, int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+lgamma_r(double16 x, __global int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+lgamma_r(double16 x, __local int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double8
+lgamma_r(double8 x, int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+lgamma_r(double8 x, __global int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+lgamma_r(double8 x, __local int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double4
+lgamma_r(double4 x, int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+lgamma_r(double4 x, __global int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+lgamma_r(double4 x, __local int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double3
+lgamma_r(double3 x, int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = lgamma_r(x.s01, &j);
+    i.s01 = j;
+    r.s2 = lgamma_r(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+lgamma_r(double3 x, __global int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = lgamma_r(x.s01, &j);
+    i.s01 = j;
+    r.s2 = lgamma_r(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+lgamma_r(double3 x, __local int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = lgamma_r(x.s01, &j);
+    i.s01 = j;
+    r.s2 = lgamma_r(x.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double2
+lgamma_r(double2 x, int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+lgamma_r(double2 x, __global int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+lgamma_r(double2 x, __local int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = lgamma_r(x.lo, &j);
+    i.lo = j;
+    r.hi = lgamma_r(x.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double16
+remquo(double16 x, double16 y, int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+remquo(double16 x, double16 y, __global int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+remquo(double16 x, double16 y, __local int16 *p)
+{
+    double16 r;
+    int16 i;
+    int8 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double8
+remquo(double8 x, double8 y, int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+remquo(double8 x, double8 y, __global int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+remquo(double8 x, double8 y, __local int8 *p)
+{
+    double8 r;
+    int8 i;
+    int4 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double4
+remquo(double4 x, double4 y, int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+remquo(double4 x, double4 y, __global int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+remquo(double4 x, double4 y, __local int4 *p)
+{
+    double4 r;
+    int4 i;
+    int2 j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double3
+remquo(double3 x, double3 y, int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = remquo(x.s01, y.s01, &j);
+    i.s01 = j;
+    r.s2 = remquo(x.s2, y.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+remquo(double3 x, double3 y, __global int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = remquo(x.s01, y.s01, &j);
+    i.s01 = j;
+    r.s2 = remquo(x.s2, y.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+remquo(double3 x, double3 y, __local int3 *p)
+{
+    double3 r;
+    int3 i;
+    int2 j;
+    int k;
+
+    r.s01 = remquo(x.s01, y.s01, &j);
+    i.s01 = j;
+    r.s2 = remquo(x.s2, y.s2, &k);
+    i.s2 = k;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double2
+remquo(double2 x, double2 y, int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+remquo(double2 x, double2 y, __global int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+remquo(double2 x, double2 y, __local int2 *p)
+{
+    double2 r;
+    int2 i;
+    int j;
+    
+
+    r.lo = remquo(x.lo, y.lo, &j);
+    i.lo = j;
+    r.hi = remquo(x.hi, y.hi, &j);
+    i.hi = j;
+
+    *p = i;
+    return r;
+}
+
+#endif
+
diff --git a/amd-builtins/math64/xvexpandD.cl b/amd-builtins/math64/xvexpandD.cl
new file mode 100644
index 0000000..88b50c2
--- /dev/null
+++ b/amd-builtins/math64/xvexpandD.cl
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// XXX this file can be removed when clp is implemented
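+// Like vexpandD.cl, this expands the pointer-result builtins fract, modf
+// and sincos to the wide vector types by splitting into .lo/.hi (.s01/.s2
+// for 3-vectors) halves and recursing into the narrower overloads.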
+
+__attribute__((overloadable, always_inline, weak)) double16
+fract(double16 x, double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+fract(double16 x, __global double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+fract(double16 x, __local double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double8
+fract(double8 x, double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+fract(double8 x, __global double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+fract(double8 x, __local double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double4
+fract(double4 x, double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+fract(double4 x, __global double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+fract(double4 x, __local double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double3
+fract(double3 x, double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = fract(x.s01, &a);
+    t.s01 = a;
+    r.s2 = fract(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+fract(double3 x, __global double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = fract(x.s01, &a);
+    t.s01 = a;
+    r.s2 = fract(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+fract(double3 x, __local double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = fract(x.s01, &a);
+    t.s01 = a;
+    r.s2 = fract(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double2
+fract(double2 x, double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+fract(double2 x, __global double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+fract(double2 x, __local double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = fract(x.lo, &a);
+    t.lo = a;
+    r.hi = fract(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double16
+modf(double16 x, double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+modf(double16 x, __global double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+modf(double16 x, __local double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double8
+modf(double8 x, double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+modf(double8 x, __global double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+modf(double8 x, __local double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double4
+modf(double4 x, double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+modf(double4 x, __global double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+modf(double4 x, __local double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double3
+modf(double3 x, double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = modf(x.s01, &a);
+    t.s01 = a;
+    r.s2 = modf(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+modf(double3 x, __global double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = modf(x.s01, &a);
+    t.s01 = a;
+    r.s2 = modf(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+modf(double3 x, __local double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = modf(x.s01, &a);
+    t.s01 = a;
+    r.s2 = modf(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double2
+modf(double2 x, double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+modf(double2 x, __global double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+modf(double2 x, __local double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = modf(x.lo, &a);
+    t.lo = a;
+    r.hi = modf(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double16
+sincos(double16 x, double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+sincos(double16 x, __global double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double16
+sincos(double16 x, __local double16 *p)
+{
+    double16 r;
+    double16 t;
+    double8 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double8
+sincos(double8 x, double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+sincos(double8 x, __global double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double8
+sincos(double8 x, __local double8 *p)
+{
+    double8 r;
+    double8 t;
+    double4 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double4
+sincos(double4 x, double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+sincos(double4 x, __global double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double4
+sincos(double4 x, __local double4 *p)
+{
+    double4 r;
+    double4 t;
+    double2 a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double3
+sincos(double3 x, double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = sincos(x.s01, &a);
+    t.s01 = a;
+    r.s2 = sincos(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+sincos(double3 x, __global double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = sincos(x.s01, &a);
+    t.s01 = a;
+    r.s2 = sincos(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double3
+sincos(double3 x, __local double3 *p)
+{
+    double3 r;
+    double3 t;
+    double2 a;
+    double b;
+
+    r.s01 = sincos(x.s01, &a);
+    t.s01 = a;
+    r.s2 = sincos(x.s2, &b);
+    t.s2 = b;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+__attribute__((overloadable, always_inline, weak)) double2
+sincos(double2 x, double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+sincos(double2 x, __global double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
+
+#ifndef __clang__
+
+__attribute__((overloadable, always_inline, weak)) double2
+sincos(double2 x, __local double2 *p)
+{
+    double2 r;
+    double2 t;
+    double a;
+    
+
+    r.lo = sincos(x.lo, &a);
+    t.lo = a;
+    r.hi = sincos(x.hi, &a);
+    t.hi = a;
+
+    *p = t;
+    return r;
+}
+
+#endif
diff --git a/amd-builtins/media/bfe.cl b/amd-builtins/media/bfe.cl
new file mode 100644
index 0000000..715f893
--- /dev/null
+++ b/amd-builtins/media/bfe.cl
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
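+// amd_bfe extracts a bitfield component-wise via the HSAIL intrinsics.
+// A minimal reference sketch of the unsigned semantics, as described by the
+// cl_amd_media_ops2 extension text (illustrative only; the intrinsic is
+// authoritative):
+//
+//   uint bfe_ref(uint src, uint offset, uint width) {
+//       offset &= 31; width &= 31;                     // both taken mod 32
+//       return width == 0 ? 0u : (src >> offset) & ~(0xffffffffu << width);
+//   }
+//
+// The int overloads below use __hsail_ibfe, which sign-extends the field.
+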
+__attribute__((overloadable,always_inline,const)) uint2 amd_bfe(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x = __hsail_bfe(v1.x, v2.x, v3.x);
+    ret.y = __hsail_bfe(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_bfe(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x = __hsail_bfe(v1.x, v2.x, v3.x);
+    ret.y = __hsail_bfe(v1.y, v2.y, v3.y);
+    ret.z = __hsail_bfe(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_bfe(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x = __hsail_bfe(v1.x, v2.x, v3.x);
+    ret.y = __hsail_bfe(v1.y, v2.y, v3.y);
+    ret.z = __hsail_bfe(v1.z, v2.z, v3.z);
+    ret.w = __hsail_bfe(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_bfe(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 = __hsail_bfe(v1.s0, v2.s0, v3.s0);
+    ret.s1 = __hsail_bfe(v1.s1, v2.s1, v3.s1);
+    ret.s2 = __hsail_bfe(v1.s2, v2.s2, v3.s2);
+    ret.s3 = __hsail_bfe(v1.s3, v2.s3, v3.s3);
+    ret.s4 = __hsail_bfe(v1.s4, v2.s4, v3.s4);
+    ret.s5 = __hsail_bfe(v1.s5, v2.s5, v3.s5);
+    ret.s6 = __hsail_bfe(v1.s6, v2.s6, v3.s6);
+    ret.s7 = __hsail_bfe(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_bfe(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 = __hsail_bfe(v1.s0, v2.s0, v3.s0);
+    ret.s1 = __hsail_bfe(v1.s1, v2.s1, v3.s1);
+    ret.s2 = __hsail_bfe(v1.s2, v2.s2, v3.s2);
+    ret.s3 = __hsail_bfe(v1.s3, v2.s3, v3.s3);
+    ret.s4 = __hsail_bfe(v1.s4, v2.s4, v3.s4);
+    ret.s5 = __hsail_bfe(v1.s5, v2.s5, v3.s5);
+    ret.s6 = __hsail_bfe(v1.s6, v2.s6, v3.s6);
+    ret.s7 = __hsail_bfe(v1.s7, v2.s7, v3.s7);
+    ret.s8 = __hsail_bfe(v1.s8, v2.s8, v3.s8);
+    ret.s9 = __hsail_bfe(v1.s9, v2.s9, v3.s9);
+    ret.sa = __hsail_bfe(v1.sa, v2.sa, v3.sa);
+    ret.sb = __hsail_bfe(v1.sb, v2.sb, v3.sb);
+    ret.sc = __hsail_bfe(v1.sc, v2.sc, v3.sc);
+    ret.sd = __hsail_bfe(v1.sd, v2.sd, v3.sd);
+    ret.se = __hsail_bfe(v1.se, v2.se, v3.se);
+    ret.sf = __hsail_bfe(v1.sf, v2.sf, v3.sf);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_bfe(uint v1, uint v2, uint v3)
+{
+    return __hsail_bfe(v1, v2, v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_bfe(int2 v1, uint2 v2, uint2 v3)
+{
+    int2 ret;
+    ret.x = __hsail_ibfe(v1.x, v2.x, v3.x);
+    ret.y = __hsail_ibfe(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_bfe(int3 v1, uint3 v2, uint3 v3)
+{
+    int3 ret;
+    ret.x = __hsail_ibfe(v1.x, v2.x, v3.x);
+    ret.y = __hsail_ibfe(v1.y, v2.y, v3.y);
+    ret.z = __hsail_ibfe(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_bfe(int4 v1, uint4 v2, uint4 v3)
+{
+    int4 ret;
+    ret.x = __hsail_ibfe(v1.x, v2.x, v3.x);
+    ret.y = __hsail_ibfe(v1.y, v2.y, v3.y);
+    ret.z = __hsail_ibfe(v1.z, v2.z, v3.z);
+    ret.w = __hsail_ibfe(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_bfe(int8 v1, uint8 v2, uint8 v3)
+{
+    int8 ret;
+    ret.s0 = __hsail_ibfe(v1.s0, v2.s0, v3.s0);
+    ret.s1 = __hsail_ibfe(v1.s1, v2.s1, v3.s1);
+    ret.s2 = __hsail_ibfe(v1.s2, v2.s2, v3.s2);
+    ret.s3 = __hsail_ibfe(v1.s3, v2.s3, v3.s3);
+    ret.s4 = __hsail_ibfe(v1.s4, v2.s4, v3.s4);
+    ret.s5 = __hsail_ibfe(v1.s5, v2.s5, v3.s5);
+    ret.s6 = __hsail_ibfe(v1.s6, v2.s6, v3.s6);
+    ret.s7 = __hsail_ibfe(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_bfe(int16 v1, uint16 v2, uint16 v3)
+{
+    int16 ret;
+    ret.s0 = __hsail_ibfe(v1.s0, v2.s0, v3.s0);
+    ret.s1 = __hsail_ibfe(v1.s1, v2.s1, v3.s1);
+    ret.s2 = __hsail_ibfe(v1.s2, v2.s2, v3.s2);
+    ret.s3 = __hsail_ibfe(v1.s3, v2.s3, v3.s3);
+    ret.s4 = __hsail_ibfe(v1.s4, v2.s4, v3.s4);
+    ret.s5 = __hsail_ibfe(v1.s5, v2.s5, v3.s5);
+    ret.s6 = __hsail_ibfe(v1.s6, v2.s6, v3.s6);
+    ret.s7 = __hsail_ibfe(v1.s7, v2.s7, v3.s7);
+    ret.s8 = __hsail_ibfe(v1.s8, v2.s8, v3.s8);
+    ret.s9 = __hsail_ibfe(v1.s9, v2.s9, v3.s9);
+    ret.sa = __hsail_ibfe(v1.sa, v2.sa, v3.sa);
+    ret.sb = __hsail_ibfe(v1.sb, v2.sb, v3.sb);
+    ret.sc = __hsail_ibfe(v1.sc, v2.sc, v3.sc);
+    ret.sd = __hsail_ibfe(v1.sd, v2.sd, v3.sd);
+    ret.se = __hsail_ibfe(v1.se, v2.se, v3.se);
+    ret.sf = __hsail_ibfe(v1.sf, v2.sf, v3.sf);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_bfe(int v1, uint v2, uint v3)
+{
+    return __hsail_ibfe(v1, v2, v3);
+}
diff --git a/amd-builtins/media/bfm.cl b/amd-builtins/media/bfm.cl
new file mode 100644
index 0000000..5c77007
--- /dev/null
+++ b/amd-builtins/media/bfm.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
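+// amd_bfm builds a bitfield mask component-wise; per the cl_amd_media_ops2
+// extension text this is ((1u << (v1 & 31)) - 1u) << (v2 & 31), i.e. v1 is
+// the field width and v2 the offset (stated for reference; __hsail_bfm is
+// authoritative).
+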
+__attribute__((overloadable,always_inline,const)) uint2 amd_bfm(uint2 v1, uint2 v2)
+{
+    uint2 ret;
+    ret.x = __hsail_bfm(v1.x, v2.x);
+    ret.y = __hsail_bfm(v1.y, v2.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_bfm(uint3 v1, uint3 v2)
+{
+    uint3 ret;
+    ret.x = __hsail_bfm(v1.x, v2.x);
+    ret.y = __hsail_bfm(v1.y, v2.y);
+    ret.z = __hsail_bfm(v1.z, v2.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_bfm(uint4 v1, uint4 v2)
+{
+    uint4 ret;
+    ret.x = __hsail_bfm(v1.x, v2.x);
+    ret.y = __hsail_bfm(v1.y, v2.y);
+    ret.z = __hsail_bfm(v1.z, v2.z);
+    ret.w = __hsail_bfm(v1.w, v2.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_bfm(uint8 v1, uint8 v2)
+{
+    uint8 ret;
+    ret.s0 = __hsail_bfm(v1.s0, v2.s0);
+    ret.s1 = __hsail_bfm(v1.s1, v2.s1);
+    ret.s2 = __hsail_bfm(v1.s2, v2.s2);
+    ret.s3 = __hsail_bfm(v1.s3, v2.s3);
+    ret.s4 = __hsail_bfm(v1.s4, v2.s4);
+    ret.s5 = __hsail_bfm(v1.s5, v2.s5);
+    ret.s6 = __hsail_bfm(v1.s6, v2.s6);
+    ret.s7 = __hsail_bfm(v1.s7, v2.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_bfm(uint16 v1, uint16 v2)
+{
+    uint16 ret;
+    ret.s0 = __hsail_bfm(v1.s0, v2.s0);
+    ret.s1 = __hsail_bfm(v1.s1, v2.s1);
+    ret.s2 = __hsail_bfm(v1.s2, v2.s2);
+    ret.s3 = __hsail_bfm(v1.s3, v2.s3);
+    ret.s4 = __hsail_bfm(v1.s4, v2.s4);
+    ret.s5 = __hsail_bfm(v1.s5, v2.s5);
+    ret.s6 = __hsail_bfm(v1.s6, v2.s6);
+    ret.s7 = __hsail_bfm(v1.s7, v2.s7);
+    ret.s8 = __hsail_bfm(v1.s8, v2.s8);
+    ret.s9 = __hsail_bfm(v1.s9, v2.s9);
+    ret.sa = __hsail_bfm(v1.sa, v2.sa);
+    ret.sb = __hsail_bfm(v1.sb, v2.sb);
+    ret.sc = __hsail_bfm(v1.sc, v2.sc);
+    ret.sd = __hsail_bfm(v1.sd, v2.sd);
+    ret.se = __hsail_bfm(v1.se, v2.se);
+    ret.sf = __hsail_bfm(v1.sf, v2.sf);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_bfm(uint v1, uint v2)
+{
+    return __hsail_bfm(v1, v2);
+}
diff --git a/amd-builtins/media/bitalign.cl b/amd-builtins/media/bitalign.cl
new file mode 100644
index 0000000..c00f1b1
--- /dev/null
+++ b/amd-builtins/media/bitalign.cl
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
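+// amd_bitalign(a, b, c) component-wise: shift the 64-bit concatenation a:b
+// right by (c & 31) bits and keep the low 32 bits, i.e.
+// (uint)((((ulong)a << 32) | b) >> (c & 31)). Stated for reference; the
+// __hsail_bitalign_b32 intrinsic is authoritative.
+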
+__attribute__((overloadable, always_inline)) uint
+amd_bitalign(uint a, uint b, uint c)
+{
+    return __hsail_bitalign_b32(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_bitalign(uint2 a, uint2 b, uint2 c)
+{
+    uint2 ret;
+    ret.x =  __hsail_bitalign_b32(a.x, b.x, c.x);
+    ret.y =  __hsail_bitalign_b32(a.y, b.y, c.y);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_bitalign(uint3 a, uint3 b, uint3 c)
+{
+    uint3 ret;
+    ret.x =  __hsail_bitalign_b32(a.x, b.x, c.x);
+    ret.y =  __hsail_bitalign_b32(a.y, b.y, c.y);
+    ret.z =  __hsail_bitalign_b32(a.z, b.z, c.z);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_bitalign(uint4 a, uint4 b, uint4 c)
+{
+    uint4 ret;
+    ret.x =  __hsail_bitalign_b32(a.x, b.x, c.x);
+    ret.y =  __hsail_bitalign_b32(a.y, b.y, c.y);
+    ret.z =  __hsail_bitalign_b32(a.z, b.z, c.z);
+    ret.w =  __hsail_bitalign_b32(a.w, b.w, c.w);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_bitalign(uint8 a, uint8 b, uint8 c)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_bitalign_b32(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_bitalign_b32(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_bitalign_b32(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_bitalign_b32(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_bitalign_b32(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_bitalign_b32(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_bitalign_b32(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_bitalign_b32(a.s7, b.s7, c.s7);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_bitalign(uint16 a, uint16 b, uint16 c)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_bitalign_b32(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_bitalign_b32(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_bitalign_b32(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_bitalign_b32(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_bitalign_b32(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_bitalign_b32(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_bitalign_b32(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_bitalign_b32(a.s7, b.s7, c.s7);
+    ret.s8 =  __hsail_bitalign_b32(a.s8, b.s8, c.s8);
+    ret.s9 =  __hsail_bitalign_b32(a.s9, b.s9, c.s9);
+    ret.sa =  __hsail_bitalign_b32(a.sa, b.sa, c.sa);
+    ret.sb =  __hsail_bitalign_b32(a.sb, b.sb, c.sb);
+    ret.sc =  __hsail_bitalign_b32(a.sc, b.sc, c.sc);
+    ret.sd =  __hsail_bitalign_b32(a.sd, b.sd, c.sd);
+    ret.se =  __hsail_bitalign_b32(a.se, b.se, c.se);
+    ret.sf =  __hsail_bitalign_b32(a.sf, b.sf, c.sf);
+    return ret;
+}
+
diff --git a/amd-builtins/media/bytealign.cl b/amd-builtins/media/bytealign.cl
new file mode 100644
index 0000000..19798fc
--- /dev/null
+++ b/amd-builtins/media/bytealign.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
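+// amd_bytealign is the byte-granular analogue of amd_bitalign:
+// (uint)((((ulong)a << 32) | b) >> ((c & 3) * 8)). Stated for reference;
+// the __hsail_bytealign_b32 intrinsic is authoritative.
+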
+__attribute__((overloadable, always_inline)) uint
+amd_bytealign(uint a, uint b, uint c)
+{
+    return __hsail_bytealign_b32(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_bytealign(uint2 a, uint2 b, uint2 c)
+{
+    uint2 ret;
+    ret.x =  __hsail_bytealign_b32(a.x, b.x, c.x);
+    ret.y =  __hsail_bytealign_b32(a.y, b.y, c.y);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_bytealign(uint3 a, uint3 b, uint3 c)
+{
+    uint3 ret;
+    ret.x =  __hsail_bytealign_b32(a.x, b.x, c.x);
+    ret.y =  __hsail_bytealign_b32(a.y, b.y, c.y);
+    ret.z =  __hsail_bytealign_b32(a.z, b.z, c.z);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_bytealign(uint4 a, uint4 b, uint4 c)
+{
+    uint4 ret;
+    ret.x =  __hsail_bytealign_b32(a.x, b.x, c.x);
+    ret.y =  __hsail_bytealign_b32(a.y, b.y, c.y);
+    ret.z =  __hsail_bytealign_b32(a.z, b.z, c.z);
+    ret.w =  __hsail_bytealign_b32(a.w, b.w, c.w);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_bytealign(uint8 a, uint8 b, uint8 c)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_bytealign_b32(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_bytealign_b32(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_bytealign_b32(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_bytealign_b32(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_bytealign_b32(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_bytealign_b32(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_bytealign_b32(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_bytealign_b32(a.s7, b.s7, c.s7);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_bytealign(uint16 a, uint16 b, uint16 c)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_bytealign_b32(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_bytealign_b32(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_bytealign_b32(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_bytealign_b32(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_bytealign_b32(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_bytealign_b32(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_bytealign_b32(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_bytealign_b32(a.s7, b.s7, c.s7);
+    ret.s8 =  __hsail_bytealign_b32(a.s8, b.s8, c.s8);
+    ret.s9 =  __hsail_bytealign_b32(a.s9, b.s9, c.s9);
+    ret.sa =  __hsail_bytealign_b32(a.sa, b.sa, c.sa);
+    ret.sb =  __hsail_bytealign_b32(a.sb, b.sb, c.sb);
+    ret.sc =  __hsail_bytealign_b32(a.sc, b.sc, c.sc);
+    ret.sd =  __hsail_bytealign_b32(a.sd, b.sd, c.sd);
+    ret.se =  __hsail_bytealign_b32(a.se, b.se, c.se);
+    ret.sf =  __hsail_bytealign_b32(a.sf, b.sf, c.sf);
+    return ret;
+}
+
diff --git a/amd-builtins/media/lerp.cl b/amd-builtins/media/lerp.cl
new file mode 100644
index 0000000..76bd587
--- /dev/null
+++ b/amd-builtins/media/lerp.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
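+// amd_lerp averages the four packed bytes of a and b, with the low bit of
+// each byte of c acting as a per-byte rounding increment:
+// result byte i = (a.byte[i] + b.byte[i] + (c.byte[i] & 1)) >> 1.
+// Stated for reference; the __hsail_lerp_u8x4 intrinsic is authoritative.
+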
+__attribute__((overloadable, always_inline)) uint
+amd_lerp(uint a, uint b, uint c)
+{
+    return __hsail_lerp_u8x4(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_lerp(uint2 a, uint2 b, uint2 c)
+{
+    uint2 ret;
+    ret.x =  __hsail_lerp_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_lerp_u8x4(a.y, b.y, c.y);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_lerp(uint3 a, uint3 b, uint3 c)
+{
+    uint3 ret;
+    ret.x =  __hsail_lerp_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_lerp_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_lerp_u8x4(a.z, b.z, c.z);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_lerp(uint4 a, uint4 b, uint4 c)
+{
+    uint4 ret;
+    ret.x =  __hsail_lerp_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_lerp_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_lerp_u8x4(a.z, b.z, c.z);
+    ret.w =  __hsail_lerp_u8x4(a.w, b.w, c.w);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_lerp(uint8 a, uint8 b, uint8 c)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_lerp_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_lerp_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_lerp_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_lerp_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_lerp_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_lerp_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_lerp_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_lerp_u8x4(a.s7, b.s7, c.s7);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_lerp(uint16 a, uint16 b, uint16 c)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_lerp_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_lerp_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_lerp_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_lerp_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_lerp_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_lerp_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_lerp_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_lerp_u8x4(a.s7, b.s7, c.s7);
+    ret.s8 =  __hsail_lerp_u8x4(a.s8, b.s8, c.s8);
+    ret.s9 =  __hsail_lerp_u8x4(a.s9, b.s9, c.s9);
+    ret.sa =  __hsail_lerp_u8x4(a.sa, b.sa, c.sa);
+    ret.sb =  __hsail_lerp_u8x4(a.sb, b.sb, c.sb);
+    ret.sc =  __hsail_lerp_u8x4(a.sc, b.sc, c.sc);
+    ret.sd =  __hsail_lerp_u8x4(a.sd, b.sd, c.sd);
+    ret.se =  __hsail_lerp_u8x4(a.se, b.se, c.se);
+    ret.sf =  __hsail_lerp_u8x4(a.sf, b.sf, c.sf);
+    return ret;
+}
+
diff --git a/amd-builtins/media/max3.cl b/amd-builtins/media/max3.cl
new file mode 100644
index 0000000..75b0f48
--- /dev/null
+++ b/amd-builtins/media/max3.cl
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
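+// amd_max3 (cl_amd_media_ops2) returns the component-wise maximum of its
+// three arguments, e.g. amd_max3(3u, 9u, 5u) == 9u.  The HSAIL intrinsics
+// are scalar, so every vector overload expands into one call per component.
+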
+__attribute__((overloadable,always_inline,const)) uint2 amd_max3(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x =  __hsail_umax3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umax3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_max3(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x =  __hsail_umax3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umax3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_umax3(v1.z, v2.z, v3.z);
+    return ret;
+}
+
+__attribute__((overloadable,always_inline,const)) uint4 amd_max3(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x =  __hsail_umax3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umax3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_umax3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_umax3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_max3(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_umax3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_umax3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_umax3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_umax3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_umax3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_umax3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_umax3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_umax3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_max3(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_umax3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_umax3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_umax3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_umax3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_umax3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_umax3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_umax3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_umax3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_umax3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_umax3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_umax3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_umax3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_umax3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_umax3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_umax3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_umax3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_max3(uint v1, uint v2, uint v3)
+{
+    return  __hsail_umax3(v1, v2, v3);
+}
+__attribute__((overloadable,always_inline,const)) float2 amd_max3(float2 v1, float2 v2, float2 v3)
+{
+    float2 ret;
+    ret.x =  __hsail_f32_max3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_max3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float3 amd_max3(float3 v1, float3 v2, float3 v3)
+{
+    float3 ret;
+    ret.x =  __hsail_f32_max3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_max3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_f32_max3(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float4 amd_max3(float4 v1, float4 v2, float4 v3)
+{
+    float4 ret;
+    ret.x =  __hsail_f32_max3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_max3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_f32_max3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_f32_max3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float8 amd_max3(float8 v1, float8 v2, float8 v3)
+{
+    float8 ret;
+    ret.s0 =  __hsail_f32_max3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_max3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_f32_max3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_max3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_f32_max3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_f32_max3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_f32_max3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_f32_max3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float16 amd_max3(float16 v1, float16 v2, float16 v3)
+{
+    float16 ret;
+    ret.s0 =  __hsail_f32_max3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_max3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_f32_max3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_max3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_f32_max3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_f32_max3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_f32_max3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_f32_max3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_f32_max3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_f32_max3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_f32_max3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_f32_max3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_f32_max3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_f32_max3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_f32_max3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_f32_max3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float amd_max3(float v1, float v2, float v3)
+{
+    return  __hsail_f32_max3(v1, v2, v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_max3(int2 v1, int2 v2, int2 v3)
+{
+    int2 ret;
+    ret.x =  __hsail_imax3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imax3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_max3(int3 v1, int3 v2, int3 v3)
+{
+    int3 ret;
+    ret.x =  __hsail_imax3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imax3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_imax3(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_max3(int4 v1, int4 v2, int4 v3)
+{
+    int4 ret;
+    ret.x =  __hsail_imax3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imax3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_imax3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_imax3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_max3(int8 v1, int8 v2, int8 v3)
+{
+    int8 ret;
+    ret.s0 =  __hsail_imax3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_imax3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_imax3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_imax3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_imax3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_imax3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_imax3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_imax3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_max3(int16 v1, int16 v2, int16 v3)
+{
+    int16 ret;
+    ret.s0 =  __hsail_imax3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_imax3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_imax3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_imax3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_imax3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_imax3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_imax3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_imax3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_imax3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_imax3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_imax3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_imax3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_imax3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_imax3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_imax3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_imax3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_max3(int v1, int v2, int v3)
+{
+    return  __hsail_imax3(v1, v2, v3);
+}
diff --git a/amd-builtins/media/media.h b/amd-builtins/media/media.h
new file mode 100644
index 0000000..eb12c55
--- /dev/null
+++ b/amd-builtins/media/media.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+
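+// Declarations of the scalar HSAIL intrinsics backing the AMD media builtins.
+// The const/pure attributes mark the calls as free of side effects, so the
+// optimizer may combine or hoist repeated calls.
+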
+extern __attribute__((const)) uint __hsail_bitalign_b32(uint, uint, uint);
+
+extern __attribute__((const)) uint __hsail_bytealign_b32(uint, uint, uint);
+
+extern __attribute__((pure)) uint __hsail_packcvt_u8x4_f32(float,float,float,float);
+
+extern __attribute__((pure)) uint __hsail_lerp_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) uint __hsail_sad_u32_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) uint __hsail_sadhi_u16x2_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) float __hsail_unpackcvt_f32_u8x4(uint,uint);
+
+extern __attribute__((const)) uint __hsail_msad(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_sadd(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_sadw(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_umin3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imin3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_umax3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imax3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_umedian3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imedian3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_bfe(uint,uint,uint);
+
+extern __attribute__((const)) float __hsail_f32_min3(float,float,float);
+
+extern __attribute__((const)) float __hsail_f32_max3(float,float,float);
+
+extern __attribute__((const)) float __hsail_f32_median3(float,float,float);
+
+extern __attribute__((const)) ulong __hsail_mqsad(ulong,uint,ulong);
+
+extern __attribute__((const)) ulong __hsail_qsad(ulong,uint,ulong);
+
+extern __attribute__((const)) uint __hsail_bfm(uint,uint);
+
+extern __attribute__((const)) int __hsail_ibfe(int,uint,uint);
diff --git a/amd-builtins/media/median3.cl b/amd-builtins/media/median3.cl
new file mode 100644
index 0000000..e446538
--- /dev/null
+++ b/amd-builtins/media/median3.cl
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
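+// amd_median3 (cl_amd_media_ops2) returns the component-wise middle value of
+// its three arguments, e.g. amd_median3(3u, 9u, 5u) == 5u.
+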
+__attribute__((overloadable,always_inline,const)) uint2 amd_median3(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x =  __hsail_umedian3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umedian3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_median3(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x =  __hsail_umedian3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umedian3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_umedian3(v1.z, v2.z, v3.z);
+    return ret;
+}
+
+__attribute__((overloadable,always_inline,const)) uint4 amd_median3(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x =  __hsail_umedian3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umedian3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_umedian3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_umedian3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_median3(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_umedian3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_umedian3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_umedian3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_umedian3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_umedian3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_umedian3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_umedian3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_umedian3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_median3(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_umedian3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_umedian3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_umedian3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_umedian3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_umedian3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_umedian3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_umedian3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_umedian3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_umedian3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_umedian3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_umedian3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_umedian3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_umedian3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_umedian3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_umedian3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_umedian3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_median3(uint v1, uint v2, uint v3)
+{
+    return  __hsail_umedian3(v1, v2, v3);
+}
+__attribute__((overloadable,always_inline,const)) float2 amd_median3(float2 v1, float2 v2, float2 v3)
+{
+    float2 ret;
+    ret.x =  __hsail_f32_median3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_median3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float3 amd_median3(float3 v1, float3 v2, float3 v3)
+{
+    float3 ret;
+    ret.x =  __hsail_f32_median3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_median3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_f32_median3(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float4 amd_median3(float4 v1, float4 v2, float4 v3)
+{
+    float4 ret;
+    ret.x =  __hsail_f32_median3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_median3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_f32_median3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_f32_median3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float8 amd_median3(float8 v1, float8 v2, float8 v3)
+{
+    float8 ret;
+    ret.s0 =  __hsail_f32_median3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_median3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_f32_median3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_median3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_f32_median3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_f32_median3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_f32_median3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_f32_median3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float16 amd_median3(float16 v1, float16 v2, float16 v3)
+{
+    float16 ret;
+    ret.s0 =  __hsail_f32_median3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_median3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_f32_median3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_median3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_f32_median3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_f32_median3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_f32_median3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_f32_median3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_f32_median3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_f32_median3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_f32_median3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_f32_median3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_f32_median3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_f32_median3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_f32_median3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_f32_median3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float amd_median3(float v1, float v2, float v3)
+{
+    return  __hsail_f32_median3(v1, v2, v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_median3(int2 v1, int2 v2, int2 v3)
+{
+    int2 ret;
+    ret.x =  __hsail_imedian3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imedian3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_median3(int3 v1, int3 v2, int3 v3)
+{
+    int3 ret;
+    ret.x =  __hsail_imedian3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imedian3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_imedian3(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_median3(int4 v1, int4 v2, int4 v3)
+{
+    int4 ret;
+    ret.x =  __hsail_imedian3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imedian3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_imedian3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_imedian3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_median3(int8 v1, int8 v2, int8 v3)
+{
+    int8 ret;
+    ret.s0 =  __hsail_imedian3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_imedian3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_imedian3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_imedian3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_imedian3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_imedian3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_imedian3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_imedian3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_median3(int16 v1, int16 v2, int16 v3)
+{
+    int16 ret;
+    ret.s0 =  __hsail_imedian3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_imedian3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_imedian3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_imedian3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_imedian3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_imedian3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_imedian3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_imedian3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_imedian3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_imedian3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_imedian3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_imedian3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_imedian3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_imedian3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_imedian3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_imedian3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_median3(int v1, int v2, int v3)
+{
+    return  __hsail_imedian3(v1, v2, v3);
+}
diff --git a/amd-builtins/media/min3.cl b/amd-builtins/media/min3.cl
new file mode 100644
index 0000000..90901db
--- /dev/null
+++ b/amd-builtins/media/min3.cl
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
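+// amd_min3 (cl_amd_media_ops2) returns the component-wise minimum of three
+// values; like max3 and median3 it expands to one scalar intrinsic call per
+// component.
+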
+__attribute__((overloadable,always_inline,const)) uint2 amd_min3(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x =  __hsail_umin3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umin3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_min3(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x =  __hsail_umin3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umin3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_umin3(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_min3(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x =  __hsail_umin3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_umin3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_umin3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_umin3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_min3(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_umin3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_umin3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_umin3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_umin3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_umin3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_umin3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_umin3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_umin3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_min3(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_umin3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_umin3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_umin3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_umin3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_umin3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_umin3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_umin3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_umin3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_umin3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_umin3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_umin3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_umin3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_umin3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_umin3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_umin3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_umin3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_min3(uint v1, uint v2, uint v3)
+{
+    return  __hsail_umin3(v1, v2, v3);
+}
+__attribute__((overloadable,always_inline,const)) float2 amd_min3(float2 v1, float2 v2, float2 v3)
+{
+    float2 ret;
+    ret.x =  __hsail_f32_min3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_min3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float3 amd_min3(float3 v1, float3 v2, float3 v3)
+{
+    float3 ret;
+    ret.x =  __hsail_f32_min3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_min3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_f32_min3(v1.z, v2.z, v3.z);
+    return ret;
+}
+
+__attribute__((overloadable,always_inline,const)) float4 amd_min3(float4 v1, float4 v2, float4 v3)
+{
+    float4 ret;
+    ret.x =  __hsail_f32_min3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_f32_min3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_f32_min3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_f32_min3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float8 amd_min3(float8 v1, float8 v2, float8 v3)
+{
+    float8 ret;
+    ret.s0 =  __hsail_f32_min3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_min3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_f32_min3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_min3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_f32_min3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_f32_min3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_f32_min3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_f32_min3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float16 amd_min3(float16 v1, float16 v2, float16 v3)
+{
+    float16 ret;
+    ret.s0 =  __hsail_f32_min3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_min3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_f32_min3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_min3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_f32_min3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_f32_min3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_f32_min3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_f32_min3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_f32_min3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_f32_min3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_f32_min3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_f32_min3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_f32_min3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_f32_min3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_f32_min3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_f32_min3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float amd_min3(float v1, float v2, float v3)
+{
+    return  __hsail_f32_min3(v1, v2, v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_min3(int2 v1, int2 v2, int2 v3)
+{
+    int2 ret;
+    ret.x =  __hsail_imin3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imin3(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_min3(int3 v1, int3 v2, int3 v3)
+{
+    int3 ret;
+    ret.x =  __hsail_imin3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imin3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_imin3(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_min3(int4 v1, int4 v2, int4 v3)
+{
+    int4 ret;
+    ret.x =  __hsail_imin3(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_imin3(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_imin3(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_imin3(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_min3(int8 v1, int8 v2, int8 v3)
+{
+    int8 ret;
+    ret.s0 =  __hsail_imin3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_imin3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_imin3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_imin3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_imin3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_imin3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_imin3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_imin3(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_min3(int16 v1, int16 v2, int16 v3)
+{
+    int16 ret;
+    ret.s0 =  __hsail_imin3(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_imin3(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_imin3(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_imin3(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_imin3(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_imin3(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_imin3(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_imin3(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_imin3(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_imin3(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_imin3(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_imin3(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_imin3(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_imin3(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_imin3(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_imin3(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_min3(int v1, int v2, int v3)
+{
+    return  __hsail_imin3(v1, v2, v3);
+}
diff --git a/amd-builtins/media/mqsad.cl b/amd-builtins/media/mqsad.cl
new file mode 100644
index 0000000..a9b551c
--- /dev/null
+++ b/amd-builtins/media/mqsad.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
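+// amd_mqsad is the masked variant of amd_qsad: each result lane is a masked
+// byte SAD of a sliding four-byte window of v1 against the 32-bit reference
+// v2, accumulated into the corresponding 16-bit lane of v3; per the
+// cl_amd_media_ops2 spec, zero bytes of the reference are excluded.
+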
+__attribute__((overloadable,always_inline,const)) ulong2 amd_mqsad(ulong2 v1, uint2 v2, ulong2 v3)
+{
+    ulong2 ret;
+    ret.x =  __hsail_mqsad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_mqsad(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong3 amd_mqsad(ulong3 v1, uint3 v2, ulong3 v3)
+{
+    ulong3 ret;
+    ret.x =  __hsail_mqsad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_mqsad(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_mqsad(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong4 amd_mqsad(ulong4 v1, uint4 v2, ulong4 v3)
+{
+    ulong4 ret;
+    ret.x =  __hsail_mqsad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_mqsad(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_mqsad(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_mqsad(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong8 amd_mqsad(ulong8 v1, uint8 v2, ulong8 v3)
+{
+    ulong8 ret;
+    ret.s0 =  __hsail_mqsad(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_mqsad(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_mqsad(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_mqsad(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_mqsad(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_mqsad(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_mqsad(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_mqsad(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong16 amd_mqsad(ulong16 v1, uint16 v2, ulong16 v3)
+{
+    ulong16 ret;
+    ret.s0 =  __hsail_mqsad(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_mqsad(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_mqsad(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_mqsad(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_mqsad(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_mqsad(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_mqsad(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_mqsad(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_mqsad(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_mqsad(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_mqsad(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_mqsad(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_mqsad(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_mqsad(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_mqsad(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_mqsad(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong amd_mqsad(ulong v1, uint v2, ulong v3)
+{
+    return  __hsail_mqsad(v1, v2, v3);
+}
+
diff --git a/amd-builtins/media/msad.cl b/amd-builtins/media/msad.cl
new file mode 100644
index 0000000..86b4dbc
--- /dev/null
+++ b/amd-builtins/media/msad.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
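+// amd_msad (cl_amd_media_ops2) is a masked byte SAD: bytes of v2 equal to
+// zero are skipped and the result is accumulated into v3, roughly
+//   dst = v3 + sum_i (v2.b_i == 0 ? 0 : |v1.b_i - v2.b_i|)
+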
+__attribute__((overloadable,always_inline,const)) uint2 amd_msad(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x =  __hsail_msad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_msad(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_msad(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x =  __hsail_msad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_msad(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_msad(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_msad(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x =  __hsail_msad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_msad(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_msad(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_msad(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_msad(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_msad(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_msad(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_msad(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_msad(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_msad(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_msad(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_msad(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_msad(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_msad(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_msad(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_msad(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_msad(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_msad(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_msad(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_msad(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_msad(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_msad(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_msad(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_msad(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_msad(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_msad(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_msad(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_msad(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_msad(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_msad(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_msad(uint v1, uint v2, uint v3)
+{
+    return  __hsail_msad(v1, v2, v3);
+}
diff --git a/amd-builtins/media/pack.cl b/amd-builtins/media/pack.cl
new file mode 100644
index 0000000..11e494e
--- /dev/null
+++ b/amd-builtins/media/pack.cl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
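+// amd_pack converts the four float components to bytes and packs them into a
+// single uint with v.s0 in the least significant byte; the HSAIL packcvt
+// intrinsic performs the float-to-u8 conversion (saturating, per the HSAIL
+// packcvt definition).
+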
+#ifdef __clang__
+__attribute__((overloadable, always_inline))
+#else
+__attribute__((always_inline))
+#endif
+uint amd_pack(float4 v)
+{
+    return __hsail_packcvt_u8x4_f32(v.s0,v.s1,v.s2,v.s3);
+}
diff --git a/amd-builtins/media/qsad.cl b/amd-builtins/media/qsad.cl
new file mode 100644
index 0000000..096cc6f
--- /dev/null
+++ b/amd-builtins/media/qsad.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
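+// amd_qsad ("quad SAD", cl_amd_media_ops2): each 64-bit element is treated as
+// eight packed bytes; four byte SADs are computed between the four-byte
+// reference v2 and sliding four-byte windows of v1, and accumulated into the
+// four 16-bit lanes of v3.
+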
+__attribute__((overloadable,always_inline,const)) ulong2 amd_qsad(ulong2 v1, uint2 v2, ulong2 v3)
+{
+    ulong2 ret;
+    ret.x =  __hsail_qsad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_qsad(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong3 amd_qsad(ulong3 v1, uint3 v2, ulong3 v3)
+{
+    ulong3 ret;
+    ret.x =  __hsail_qsad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_qsad(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_qsad(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong4 amd_qsad(ulong4 v1, uint4 v2, ulong4 v3)
+{
+    ulong4 ret;
+    ret.x =  __hsail_qsad(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_qsad(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_qsad(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_qsad(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong8 amd_qsad(ulong8 v1, uint8 v2, ulong8 v3)
+{
+    ulong8 ret;
+    ret.s0 =  __hsail_qsad(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_qsad(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_qsad(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_qsad(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_qsad(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_qsad(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_qsad(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_qsad(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong16 amd_qsad(ulong16 v1, uint16 v2, ulong16 v3)
+{
+    ulong16 ret;
+    ret.s0 =  __hsail_qsad(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_qsad(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_qsad(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_qsad(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_qsad(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_qsad(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_qsad(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_qsad(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_qsad(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_qsad(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_qsad(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_qsad(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_qsad(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_qsad(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_qsad(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_qsad(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong amd_qsad(ulong v1, uint v2, ulong v3)
+{
+    return  __hsail_qsad(v1, v2, v3);
+}
+
diff --git a/amd-builtins/media/sad.cl b/amd-builtins/media/sad.cl
new file mode 100644
index 0000000..1f81bf4
--- /dev/null
+++ b/amd-builtins/media/sad.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
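+// amd_sad (cl_amd_media_ops) accumulates the byte-wise sum of absolute
+// differences of a and b into c:
+//   dst = c + |a.b0 - b.b0| + |a.b1 - b.b1| + |a.b2 - b.b2| + |a.b3 - b.b3|
+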
+__attribute__((overloadable, always_inline)) uint
+amd_sad(uint a, uint b, uint c)
+{
+    return __hsail_sad_u32_u8x4(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_sad(uint2 a, uint2 b, uint2 c)
+{
+    uint2 ret;
+    ret.x =  __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_sad(uint3 a, uint3 b, uint3 c)
+{
+    uint3 ret;
+    ret.x =  __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sad_u32_u8x4(a.z, b.z, c.z);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_sad(uint4 a, uint4 b, uint4 c)
+{
+    uint4 ret;
+    ret.x =  __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sad_u32_u8x4(a.z, b.z, c.z);
+    ret.w =  __hsail_sad_u32_u8x4(a.w, b.w, c.w);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_sad(uint8 a, uint8 b, uint8 c)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sad_u32_u8x4(a.s7, b.s7, c.s7);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_sad(uint16 a, uint16 b, uint16 c)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sad_u32_u8x4(a.s7, b.s7, c.s7);
+    ret.s8 =  __hsail_sad_u32_u8x4(a.s8, b.s8, c.s8);
+    ret.s9 =  __hsail_sad_u32_u8x4(a.s9, b.s9, c.s9);
+    ret.sa =  __hsail_sad_u32_u8x4(a.sa, b.sa, c.sa);
+    ret.sb =  __hsail_sad_u32_u8x4(a.sb, b.sb, c.sb);
+    ret.sc =  __hsail_sad_u32_u8x4(a.sc, b.sc, c.sc);
+    ret.sd =  __hsail_sad_u32_u8x4(a.sd, b.sd, c.sd);
+    ret.se =  __hsail_sad_u32_u8x4(a.se, b.se, c.se);
+    ret.sf =  __hsail_sad_u32_u8x4(a.sf, b.sf, c.sf);
+    return ret;
+}
+
diff --git a/amd-builtins/media/sad4.cl b/amd-builtins/media/sad4.cl
new file mode 100644
index 0000000..38a60a4
--- /dev/null
+++ b/amd-builtins/media/sad4.cl
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+#ifdef __clang__
+__attribute__((overloadable, always_inline))
+#else
+__attribute__((always_inline))
+#endif
+uint amd_sad4(uint4 x, uint4 y, uint z)
+{
+    uint a = __hsail_sad_u32_u8x4(x.s0,y.s0,z);
+    a =  __hsail_sad_u32_u8x4(x.s1,y.s1,a);
+    a =  __hsail_sad_u32_u8x4(x.s2,y.s2,a);
+
+    return  __hsail_sad_u32_u8x4(x.s3,y.s3,a);
+}
+
diff --git a/amd-builtins/media/sadd.cl b/amd-builtins/media/sadd.cl
new file mode 100644
index 0000000..36c8c05
--- /dev/null
+++ b/amd-builtins/media/sadd.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
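+// amd_sadd (cl_amd_media_ops2) is the 32-bit member of the SAD family: per
+// the spec it accumulates the absolute difference of two uints,
+//   dst = v3 + |v1 - v2|
+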
+__attribute__((overloadable,always_inline,const)) uint2 amd_sadd(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x =  __hsail_sadd(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_sadd(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_sadd(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x =  __hsail_sadd(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_sadd(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_sadd(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_sadd(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x =  __hsail_sadd(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_sadd(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_sadd(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_sadd(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_sadd(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sadd(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadd(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_sadd(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadd(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_sadd(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_sadd(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_sadd(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_sadd(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_sadd(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sadd(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadd(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_sadd(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadd(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_sadd(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_sadd(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_sadd(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_sadd(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_sadd(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_sadd(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_sadd(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_sadd(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_sadd(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_sadd(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_sadd(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_sadd(v1.sf, v2.sf, v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_sadd(uint v1, uint v2, uint v3)
+{
+    return  __hsail_sadd(v1, v2, v3);
+}
diff --git a/amd-builtins/media/sadhi.cl b/amd-builtins/media/sadhi.cl
new file mode 100644
index 0000000..357e942
--- /dev/null
+++ b/amd-builtins/media/sadhi.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
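+// amd_sadhi (cl_amd_media_ops) is amd_sad with the byte SAD added into the
+// high 16 bits of the accumulator:  dst = c + (SAD(a, b) << 16)
+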
+__attribute__((overloadable, always_inline)) uint
+amd_sadhi(uint a, uint b, uint c)
+{
+    return __hsail_sadhi_u16x2_u8x4(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_sadhi(uint2 a, uint2 b, uint2 c)
+{
+    uint2 ret;
+    ret.x =  __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_sadhi(uint3 a, uint3 b, uint3 c)
+{
+    uint3 ret;
+    ret.x =  __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_sadhi(uint4 a, uint4 b, uint4 c)
+{
+    uint4 ret;
+    ret.x =  __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z);
+    ret.w =  __hsail_sadhi_u16x2_u8x4(a.w, b.w, c.w);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_sadhi(uint8 a, uint8 b, uint8 c)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sadhi_u16x2_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_sadhi(uint16 a, uint16 b, uint16 c)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sadhi_u16x2_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7);
+    ret.s8 =  __hsail_sadhi_u16x2_u8x4(a.s8, b.s8, c.s8);
+    ret.s9 =  __hsail_sadhi_u16x2_u8x4(a.s9, b.s9, c.s9);
+    ret.sa =  __hsail_sadhi_u16x2_u8x4(a.sa, b.sa, c.sa);
+    ret.sb =  __hsail_sadhi_u16x2_u8x4(a.sb, b.sb, c.sb);
+    ret.sc =  __hsail_sadhi_u16x2_u8x4(a.sc, b.sc, c.sc);
+    ret.sd =  __hsail_sadhi_u16x2_u8x4(a.sd, b.sd, c.sd);
+    ret.se =  __hsail_sadhi_u16x2_u8x4(a.se, b.se, c.se);
+    ret.sf =  __hsail_sadhi_u16x2_u8x4(a.sf, b.sf, c.sf);
+    return ret;
+}
+
diff --git a/amd-builtins/media/sadw.cl b/amd-builtins/media/sadw.cl
new file mode 100644
index 0000000..3d13b7a
--- /dev/null
+++ b/amd-builtins/media/sadw.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_sadw(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x =  __hsail_sadw(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_sadw(v1.y, v2.y, v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_sadw(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x =  __hsail_sadw(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_sadw(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_sadw(v1.z, v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_sadw(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x =  __hsail_sadw(v1.x, v2.x, v3.x);
+    ret.y =  __hsail_sadw(v1.y, v2.y, v3.y);
+    ret.z =  __hsail_sadw(v1.z, v2.z, v3.z);
+    ret.w =  __hsail_sadw(v1.w, v2.w, v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_sadw(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sadw(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadw(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_sadw(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadw(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_sadw(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_sadw(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_sadw(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_sadw(v1.s7, v2.s7, v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_sadw(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sadw(v1.s0, v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadw(v1.s1, v2.s1, v3.s1);
+    ret.s2 =  __hsail_sadw(v1.s2, v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadw(v1.s3, v2.s3, v3.s3);
+    ret.s4 =  __hsail_sadw(v1.s4, v2.s4, v3.s4);
+    ret.s5 =  __hsail_sadw(v1.s5, v2.s5, v3.s5);
+    ret.s6 =  __hsail_sadw(v1.s6, v2.s6, v3.s6);
+    ret.s7 =  __hsail_sadw(v1.s7, v2.s7, v3.s7);
+    ret.s8 =  __hsail_sadw(v1.s8, v2.s8, v3.s8);
+    ret.s9 =  __hsail_sadw(v1.s9, v2.s9, v3.s9);
+    ret.sa =  __hsail_sadw(v1.sa, v2.sa, v3.sa);
+    ret.sb =  __hsail_sadw(v1.sb, v2.sb, v3.sb);
+    ret.sc =  __hsail_sadw(v1.sc, v2.sc, v3.sc);
+    ret.sd =  __hsail_sadw(v1.sd, v2.sd, v3.sd);
+    ret.se =  __hsail_sadw(v1.se, v2.se, v3.se);
+    ret.sf =  __hsail_sadw(v1.sf, v2.sf, v3.sf);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_sadw(uint v1, uint v2, uint v3)
+{
+    return __hsail_sadw(v1, v2, v3);
+}
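+
+// Usage sketch (illustrative, not part of the library): amd_sadw
+// accumulates a sum of absolute differences into v3 via the HSAIL sadw
+// intrinsic, applied component-wise over vector operands, e.g.
+//   uint4 acc = amd_sadw(row0, row1, (uint4)(0));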
diff --git a/amd-builtins/media/unpack.cl b/amd-builtins/media/unpack.cl
new file mode 100644
index 0000000..96a35b2
--- /dev/null
+++ b/amd-builtins/media/unpack.cl
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack0(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,0);
+}
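+
+// Reference behaviour (a sketch, assuming amd_unpackN follows the
+// cl_amd_media_ops description: byte N of the packed word, converted
+// to float):
+//   amd_unpack0(a) == (float)(a & 0xffU)
+// and amd_unpack1..3 below select bytes 1..3 accordingly.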
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack0(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,0);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack0(uint3 a)
+{
+    float3 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,0);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,0);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack0(uint4 a)
+{
+    float4 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,0);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,0);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,0);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack0(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,0);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,0);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,0);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,0);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,0);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,0);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,0);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack0(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,0);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,0);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,0);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,0);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,0);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,0);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,0);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,0);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,0);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,0);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,0);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,0);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,0);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,0);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,0);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack1(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,1);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack1(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,1);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack1(uint3 a)
+{
+    float3 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,1);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,1);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack1(uint4 a)
+{
+    float4 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,1);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,1);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,1);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack1(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,1);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,1);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,1);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,1);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,1);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,1);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,1);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack1(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,1);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,1);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,1);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,1);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,1);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,1);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,1);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,1);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,1);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,1);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,1);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,1);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,1);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,1);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,1);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack2(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,2);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack2(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,2);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack2(uint3 a)
+{
+    float3 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,2);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,2);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack2(uint4 a)
+{
+    float4 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,2);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,2);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,2);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack2(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,2);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,2);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,2);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,2);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,2);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,2);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,2);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack2(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,2);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,2);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,2);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,2);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,2);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,2);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,2);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,2);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,2);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,2);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,2);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,2);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,2);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,2);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,2);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack3(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,3);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack3(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,3);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack3(uint3 a)
+{
+    float3 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,3);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,3);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack3(uint4 a)
+{
+    float4 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,3);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,3);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,3);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack3(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,3);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,3);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,3);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,3);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,3);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,3);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,3);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack3(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,3);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,3);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,3);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,3);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,3);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,3);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,3);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,3);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,3);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,3);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,3);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,3);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,3);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,3);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,3);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,3);
+    return ret;
+}
diff --git a/amd-builtins/misc/amdil-to-hsail.cl b/amd-builtins/misc/amdil-to-hsail.cl
new file mode 100644
index 0000000..43d3922
--- /dev/null
+++ b/amd-builtins/misc/amdil-to-hsail.cl
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// Translation shims mapping the legacy __amdil_* builtins onto __hsail_* intrinsics
+
+// HSAIL intrinsic functions used by math32 functions
+extern __attribute__((pure)) float __hsail_fma_f32(float, float, float);
+extern __attribute__((pure)) float __hsail_nfma_f32(float, float, float);
+extern __attribute__((pure)) float __hsail_min_f32(float, float);
+extern __attribute__((pure)) float __hsail_max_f32(float, float);
+extern __attribute__((pure)) float __hsail_ftz_f32(float);
+extern __attribute__((pure)) float __hsail_round_f32(float);
+extern __attribute__((pure)) float __hsail_floor_f32(float);
+extern __attribute__((pure)) float __hsail_ceil_f32(float);
+extern __attribute__((pure)) float __hsail_trunc_f32(float);
+extern __attribute__((pure)) float __hsail_abs_f32(float);
+
+extern __attribute__((pure)) int  __hsail_min_s32(int, int);
+extern __attribute__((pure)) int  __hsail_max_s32(int, int);
+extern __attribute__((pure)) uint __hsail_min_u32(uint, uint);
+extern __attribute__((pure)) uint __hsail_max_u32(uint, uint);
+extern __attribute__((pure)) int  __hsail_mulhi_s32(int, int);
+extern __attribute__((pure)) uint __hsail_mulhi_u32(uint, uint);
+extern __attribute__((pure)) long  __hsail_mulhi_s64(long, long);
+extern __attribute__((pure)) ulong __hsail_mulhi_u64(ulong, ulong);
+
+// HSAIL intrinsic functions used by math64 functions
+extern __attribute__((pure)) double __hsail_fma_f64(double, double, double);
+extern __attribute__((pure)) double __hsail_nfma_f64(double, double, double);
+extern __attribute__((pure)) double __hsail_max_f64(double, double);
+extern __attribute__((pure)) double __hsail_min_f64(double, double);
+extern __attribute__((pure)) double __hsail_round_f64(double);
+extern __attribute__((pure)) double __hsail_floor_f64(double);
+extern __attribute__((pure)) double __hsail_ceil_f64(double);
+extern __attribute__((pure)) double __hsail_trunc_f64(double);
+extern __attribute__((pure)) double __hsail_abs_f64(double);
+extern __attribute__((pure)) double __hsail_nrsqrt_f64(double);
+extern __attribute__((pure)) double __hsail_nsqrt_f64(double);
+
+extern __attribute__((pure)) uint __hsail_mad_u32(uint, uint, uint);
+
+// HSAIL conversion intrinsics
+extern __attribute__((pure)) float __cvt_f32_f16(uint op1);
+
+extern __attribute__((pure)) float __cvt_f16_rtz_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rte_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rtn_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rtp_f32(float op1);
+
+extern __attribute__((pure)) float __cvt_f16_rtz_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rte_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rtn_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rtp_f64(double op1);
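+
+// The _rte/_rtz/_rtn/_rtp suffixes select the rounding mode used when
+// narrowing to half (to nearest even, toward zero, toward -inf, toward
+// +inf), mirroring OpenCL's convert_* rounding suffixes; note the
+// declarations return the half value in a float, as cl_khr_fp16 is not
+// assumed here.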
+
+// Misc HSAIL intrinsic functions
+extern __attribute__((const)) uint __hsail_bitselect_u32(uint, uint, uint);
+extern __attribute__((pure)) int  __hsail_class_f32(float, int);
+extern __attribute__((pure)) int  __hsail_class_f64(double, int);
+extern __attribute__((pure)) int  __hsail_mad24_s32(int, int, int);
+extern __attribute__((pure)) uint __hsail_mad24_u32(uint, uint, uint);
+extern __attribute__((pure)) int  __hsail_mul24_s32(int, int);
+extern __attribute__((pure)) uint __hsail_mul24_u32(uint, uint);
+
+extern __attribute__((pure)) int __hsail_popcount_u32_b32(int);
+
+extern __attribute__((pure)) int __hsail_firstbit_u32(uint);
+
+extern __attribute__((pure)) float  __hsail_fraction_f32(float);
+extern __attribute__((pure)) double __hsail_fraction_f64(double);
+
+// __amdil_ math32 function defs
+
+__attribute__((weak,always_inline)) float
+__amdil_div_f32(float x, float y) {
+  return native_divide(x, y);
+}
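+
+// Note: __amdil_div_f32 and __amdil_improved_div_f32 (below) both map
+// to native_divide, i.e. a reduced-precision hardware divide.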
+
+__attribute__((weak,always_inline)) float
+__amdil_fma_f32(float x, float y, float z) {
+  return __hsail_fma_f32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_mad_f32(float x, float y, float z) {
+  return __hsail_nfma_f32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_min_f32(float x, float y) {
+  return __hsail_min_f32(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_max_f32(float x, float y) {
+  return __hsail_max_f32(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__ftz_f32(float x) {
+  return __hsail_ftz_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_nearest_f32(float x) {
+  return __hsail_round_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_neginf_f32(float x) {
+  return __hsail_floor_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_posinf_f32(float x) {
+  return __hsail_ceil_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_zero_f32(float x) {
+  return __hsail_trunc_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_fabs_f32(float x) {
+  return __hsail_abs_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_improved_div_f32(float x, float y) {
+  return native_divide(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imin_i32(int x, int y) {
+  return __hsail_min_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imax_i32(int x, int y) {
+  return __hsail_max_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umin_u32(uint x, uint y) {
+  return __hsail_min_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umax_u32(uint x, uint y) {
+  return __hsail_max_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imul_high_i32(int x, int y) {
+  return __hsail_mulhi_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umul_high_u32(uint x, uint y) {
+  return __hsail_mulhi_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint 
+__amdil_umad_u32(uint x, uint y, uint z) {
+  return __hsail_mad_u32(x, y, z);
+}
+
+// __amdil_ math64 function defs
+
+__attribute__((weak,always_inline)) double
+__amdil_fma_f64(double x, double y, double z) {
+  return __hsail_fma_f64(x, y, z);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_mad_f64(double x, double y, double z) {
+  return __hsail_nfma_f64(x, y, z);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_max_f64(double x, double y) {
+  return __hsail_max_f64(x, y);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_nearest_f64(double x) {
+  return __hsail_round_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_neginf_f64(double x) {
+  return __hsail_floor_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_posinf_f64(double x) {
+  return __hsail_ceil_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_zero_f64(double x) {
+  return __hsail_trunc_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_min_f64(double x, double y) {
+  return __hsail_min_f64(x, y);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_fabs_f64(double x) {
+  return __hsail_abs_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_sqrt_f64(double x) {
+  return __hsail_nsqrt_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_rsq_f64(double x) {
+  return __hsail_nrsqrt_f64(x);
+}
+
+// __amdil conversion functions
+
+__attribute__((weak,always_inline)) float 
+__amdil_half_to_float_f32(uint x) {
+  return __cvt_f32_f16(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_f32(float x) {
+  return __cvt_f16_rtz_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_near_f32(float x) {
+  return __cvt_f16_rte_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_neg_inf_f32(float x) {
+  return __cvt_f16_rtn_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_plus_inf_f32(float x) {
+  return __cvt_f16_rtp_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_f64(double x) {
+  return __cvt_f16_rtz_f64(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_near_f64(double x) {
+  return __cvt_f16_rte_f64(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_neg_inf_f64(double x) {
+  return __cvt_f16_rtn_f64(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_plus_inf_f64(double x) {
+  return __cvt_f16_rtp_f64(x);
+}
+
+// Misc __amdil_ function defs
+
+__attribute__((weak,always_inline)) uint
+__amdil_bfi_u32(uint x, uint y, uint z) {
+  return __hsail_bitselect_u32(x, y, z);
+}
+
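+// __hsail_class_* returns a 0/1 test result, while these shims return
+// an all-ones (0xffffffff) / 0 mask; hence the select on bit 0 below.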
+__attribute__((weak,always_inline)) int
+__amdil_class_f32(float x, int y) {
+  int cval = __hsail_class_f32(x, y);
+  int ret = (cval & 0x1) ? (0xffffffffU) : 0;
+  return ret;
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_class_f64(double x, int y) {
+  int cval = __hsail_class_f64(x, y);
+  int ret = (cval & 0x1) ? (0xffffffffU) : 0;
+  return ret;
+}
+
+__attribute__((weak,always_inline)) int 
+__amdil_imad24_i32(int x, int y, int z) {
+  return __hsail_mad24_s32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) uint 
+__amdil_umad24_u32(uint x, uint y, uint z) {
+  return __hsail_mad24_u32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) int 
+__amdil_imul24_i32(int x, int y) {
+  return __hsail_mul24_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint 
+__amdil_umul24_u32(uint x, uint y) {
+  return __hsail_mul24_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_count_bits_i32(int x) {
+  return __hsail_popcount_u32_b32(x);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_ffb_hi_u32(uint x) {
+  return __hsail_firstbit_u32(x);
+}
+
+//#ifdef HSAIL_SPEC_CURRENT
+__attribute__((weak,always_inline)) float
+__amdil_fraction_f32(float x) {
+  return __hsail_fraction_f32(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_fraction_f64(double x) {
+  return __hsail_fraction_f64(x);
+}
+//#endif 
+
diff --git a/amd-builtins/misc/atomicWorkItemFence.cl b/amd-builtins/misc/atomicWorkItemFence.cl
new file mode 100644
index 0000000..6ea86a0
--- /dev/null
+++ b/amd-builtins/misc/atomicWorkItemFence.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __OPENCL_C_VERSION__ >= 200
+extern void __atomic_memfence(uint flags, uint mo, uint msc);
+enum BrigMemoryFenceSegments {
+  BRIG_MEMORY_FENCE_NONE   = 0,
+  BRIG_MEMORY_FENCE_GROUP  = 1,
+  BRIG_MEMORY_FENCE_GLOBAL = 2,
+  BRIG_MEMORY_FENCE_BOTH   = 3,
+  BRIG_MEMORY_FENCE_IMAGE  = 4
+};
+
+enum BrigMemoryOrder {
+  BRIG_MEMORY_ORDER_NONE = 0,
+  BRIG_MEMORY_ORDER_RELAXED = 1,
+  BRIG_MEMORY_ORDER_ACQUIRE = 2,
+  BRIG_MEMORY_ORDER_RELEASE = 3,
+  BRIG_MEMORY_ORDER_ACQUIRE_RELEASE = 4
+};
+
+enum BrigMemoryScope {
+  BRIG_MEMORY_SCOPE_NONE = 0,
+  BRIG_MEMORY_SCOPE_WAVEFRONT = 1,
+  BRIG_MEMORY_SCOPE_WORKGROUP = 2,
+  BRIG_MEMORY_SCOPE_COMPONENT = 3,
+  BRIG_MEMORY_SCOPE_SYSTEM = 4,
+  BRIG_MEMORY_SCOPE_WORKITEM = 5
+};
+
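+// Map OpenCL memory_order onto BRIG memory orders. HSAIL fences expose
+// relaxed, acquire, release and acquire-release only, so
+// memory_order_seq_cst is mapped to acquire-release here.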
+static inline uint getBrigMemoryOrder(memory_order mo) {
+  switch(mo) {
+    default : return BRIG_MEMORY_ORDER_NONE;
+    case memory_order_relaxed : return BRIG_MEMORY_ORDER_RELAXED;
+    case memory_order_release : return BRIG_MEMORY_ORDER_RELEASE;
+    case memory_order_acquire : return BRIG_MEMORY_ORDER_ACQUIRE;
+    case memory_order_acq_rel :
+    case memory_order_seq_cst : return BRIG_MEMORY_ORDER_ACQUIRE_RELEASE;
+  }
+}
+
+static inline uint getBrigMemoryScope(memory_scope msc) {
+  switch(msc) {
+    default : return BRIG_MEMORY_SCOPE_NONE;
+    case memory_scope_work_group : return BRIG_MEMORY_SCOPE_WORKGROUP;
+    case memory_scope_device : return BRIG_MEMORY_SCOPE_COMPONENT;
+    case memory_scope_all_svm_devices : return BRIG_MEMORY_SCOPE_SYSTEM;
+    case memory_scope_sub_group : return BRIG_MEMORY_SCOPE_WAVEFRONT;
+    case memory_scope_work_item : return BRIG_MEMORY_SCOPE_WORKITEM;
+  }
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+atomic_work_item_fence(/*cl_mem_fence_flags*/ unsigned flag, memory_order mo, memory_scope msc) {
+  uint brigSegment = 0;
+  uint brigMemoryOrder = getBrigMemoryOrder(mo);
+  uint brigMemoryScope = BRIG_MEMORY_SCOPE_WORKGROUP;
+  // relaxed fence has no effect
+  if (mo == memory_order_relaxed) return;
+  if ((flag & CLK_GLOBAL_MEM_FENCE) && (flag & CLK_LOCAL_MEM_FENCE)) {
+    brigSegment = BRIG_MEMORY_FENCE_BOTH;
+    brigMemoryScope = getBrigMemoryScope(msc);
+  }
+  else if (flag & CLK_GLOBAL_MEM_FENCE) {
+    brigSegment = BRIG_MEMORY_FENCE_GLOBAL;
+    brigMemoryScope = getBrigMemoryScope(msc);
+  }
+  else if (flag & CLK_LOCAL_MEM_FENCE) {
+    brigSegment = BRIG_MEMORY_FENCE_GROUP;
+  }
+  if (brigSegment != 0) {
+    __atomic_memfence(brigSegment, brigMemoryOrder, brigMemoryScope);
+  }
+  if (flag & CLK_IMAGE_MEM_FENCE) {
+    brigMemoryScope = getBrigMemoryScope(msc);
+    __atomic_memfence(BRIG_MEMORY_FENCE_IMAGE, BRIG_MEMORY_ORDER_ACQUIRE_RELEASE, brigMemoryScope);
+  }
+}
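+
+// Usage sketch (illustrative, not part of the library): publish stores
+// to a __local buffer to the rest of the work-group:
+//   tile[lid] = v;
+//   atomic_work_item_fence(CLK_LOCAL_MEM_FENCE, memory_order_release,
+//                          memory_scope_work_group);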
+#endif // __OPENCL_C_VERSION__ >= 200
diff --git a/amd-builtins/misc/awgcpy.cl b/amd-builtins/misc/awgcpy.cl
new file mode 100644
index 0000000..6a5f302
--- /dev/null
+++ b/amd-builtins/misc/awgcpy.cl
@@ -0,0 +1,2696 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((pure)) int __hsail_workitemid_flat(void);
+
+__attribute__((always_inline)) static event_t
+__AWGClgI1(__local uchar * dst, const __global uchar * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local uchar *, const __global uchar *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local char *, const __global char *, size_t, event_t);
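+
+// Each work-item copies a strided slice (start = flat work-item id,
+// stride = work-group size), and the trailing barrier makes the copy
+// visible group-wide; signed and unsigned element types share one
+// implementation through the aliases above, since the copy is bitwise.
+// Usage sketch (hypothetical kernel, not part of the library): all
+// work-items make the same call, then wait on the returned event:
+#if 0
+__kernel void example(__global const uchar *in, __global uchar *out)
+{
+    __local uchar tile[64];
+    event_t ev = async_work_group_copy(tile, in, 64, 0);
+    wait_group_events(1, &ev);
+    out[get_global_id(0)] = tile[get_local_id(0) % 64];
+}
+#endif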
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI1(__local uchar *dst, const __global uchar *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local uchar *, const __global uchar *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local char *, const __global char *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI1(__global uchar * dst, const __local uchar * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global uchar *, const __local uchar *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global char *, const __local char *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI1(__global uchar *dst, const __local uchar *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global uchar *, const __local uchar *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global char *, const __local char *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI2(__local ushort * dst, const __global ushort * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local ushort *, const __global ushort *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local short *, const __global short *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI2(__local ushort *dst, const __global ushort *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local ushort *, const __global ushort *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local short *, const __global short *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI2(__global ushort * dst, const __local ushort * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global ushort *, const __local ushort *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global short *, const __local short *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI2(__global ushort *dst, const __local ushort *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t async_work_group_strided_copy(__global ushort *, const __local ushort *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t async_work_group_strided_copy(__global short *, const __local short *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI4(__local uint * dst, const __global uint * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local uint *, const __global uint *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local int *, const __global int *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI4(__local uint *dst, const __global uint *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local uint *, const __global uint *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local int *, const __global int *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI4(__global uint * dst, const __local uint * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global uint *, const __local uint *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global int *, const __local int *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI4(__global uint *dst, const __local uint *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global uint *, const __local uint *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global int *, const __local int *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI8(__local ulong * dst, const __global ulong * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local ulong *, const __global ulong *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local long *, const __global long *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI8(__local ulong *dst, const __global ulong *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local ulong *, const __global ulong *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local long *, const __global long *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI8(__global ulong * dst, const __local ulong * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global ulong *, const __local ulong *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global long *, const __local long *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI8(__global ulong *dst, const __local ulong *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global ulong *, const __local ulong *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global long *, const __local long *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float * dst, const __global float * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float *dst, const __global float *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float * dst, const __local float * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float *dst, const __local float *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double * dst, const __global double * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double *dst, const __global double *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double * dst, const __local double * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double *dst, const __local double *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I1(__local uchar2 * dst, const __global uchar2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local uchar2 *, const __global uchar2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local char2 *, const __global char2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I1(__local uchar2 *dst, const __global uchar2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local uchar2 *, const __global uchar2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local char2 *, const __global char2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I1(__global uchar2 * dst, const __local uchar2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global uchar2 *, const __local uchar2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global char2 *, const __local char2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I1(__global uchar2 *dst, const __local uchar2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global uchar2 *, const __local uchar2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global char2 *, const __local char2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I2(__local ushort2 * dst, const __global ushort2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t async_work_group_copy(__local ushort2 *, const __global ushort2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t async_work_group_copy(__local short2 *, const __global short2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I2(__local ushort2 *dst, const __global ushort2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local ushort2 *, const __global ushort2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local short2 *, const __global short2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I2(__global ushort2 * dst, const __local ushort2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global ushort2 *, const __local ushort2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global short2 *, const __local short2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I2(__global ushort2 *dst, const __local ushort2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global ushort2 *, const __local ushort2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global short2 *, const __local short2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I4(__local uint2 * dst, const __global uint2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local uint2 *, const __global uint2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local int2 *, const __global int2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I4(__local uint2 *dst, const __global uint2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local uint2 *, const __global uint2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local int2 *, const __global int2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I4(__global uint2 * dst, const __local uint2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global uint2 *, const __local uint2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global int2 *, const __local int2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I4(__global uint2 *dst, const __local uint2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global uint2 *, const __local uint2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global int2 *, const __local int2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I8(__local ulong2 * dst, const __global ulong2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local ulong2 *, const __global ulong2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local long2 *, const __global long2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I8(__local ulong2 *dst, const __global ulong2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local ulong2 *, const __global ulong2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local long2 *, const __global long2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I8(__global ulong2 * dst, const __local ulong2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global ulong2 *, const __local ulong2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global long2 *, const __local long2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I8(__global ulong2 *dst, const __local ulong2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global ulong2 *, const __local ulong2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global long2 *, const __local long2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float2 * dst, const __global float2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float2 * dst, const __local float2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float2 *p, size_t n)
+{
+    // nothing to do
+}
+
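+// The float overloads above and the double overloads below are defined
+// directly rather than via an aliased static helper, presumably because
+// they have no signed/unsigned twin to share code with.  The double
+// variants assume the target supports 64-bit floating point (cl_khr_fp64).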
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double2 * dst, const __global double2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double2 * dst, const __local double2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double2 *p, size_t n)
+{
+    // nothing to do
+}
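+
+/*
+ * Usage sketch (hypothetical kernel, not part of this library).  Every
+ * work-item in the work-group must reach the call with identical arguments,
+ * and the OpenCL spec still requires wait_group_events() before the copied
+ * data is used, even though this implementation already completes the
+ * transfer at the internal barrier:
+ *
+ *   __kernel void stage(const __global float2 *in, __global float2 *out)
+ *   {
+ *       __local float2 tile[64];
+ *       event_t ev = async_work_group_copy(tile, in, 64, 0);
+ *       wait_group_events(1, &ev);
+ *       out[get_global_id(0)] = tile[get_local_id(0)] * 2.0f;
+ *   }
+ */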
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I1(__local uchar3 * dst, const __global uchar3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local uchar3 *, const __global uchar3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local char3 *, const __global char3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I1(__local uchar3 *dst, const __global uchar3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local uchar3 *, const __global uchar3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local char3 *, const __global char3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I1(__global uchar3 * dst, const __local uchar3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global uchar3 *, const __local uchar3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global char3 *, const __local char3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I1(__global uchar3 *dst, const __local uchar3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global uchar3 *, const __local uchar3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global char3 *, const __local char3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char3 *p, size_t n)
+{
+    // nothing to do
+}
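+
+// Note on the 3-component variants: per the OpenCL C specification, a
+// 3-component vector has the size and alignment of the corresponding
+// 4-component vector, so each uchar3/char3 element copied here occupies
+// 4 bytes (and likewise for the wider 3-component types below).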
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I2(__local ushort3 * dst, const __global ushort3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local ushort3 *, const __global ushort3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local short3 *, const __global short3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I2(__local ushort3 *dst, const __global ushort3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local ushort3 *, const __global ushort3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local short3 *, const __global short3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I2(__global ushort3 * dst, const __local ushort3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global ushort3 *, const __local ushort3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global short3 *, const __local short3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I2(__global ushort3 *dst, const __local ushort3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global ushort3 *, const __local ushort3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global short3 *, const __local short3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I4(__local uint3 * dst, const __global uint3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local uint3 *, const __global uint3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local int3 *, const __global int3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I4(__local uint3 *dst, const __global uint3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local uint3 *, const __global uint3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local int3 *, const __global int3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I4(__global uint3 * dst, const __local uint3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global uint3 *, const __local uint3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global int3 *, const __local int3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I4(__global uint3 *dst, const __local uint3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global uint3 *, const __local uint3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global int3 *, const __local int3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I8(__local ulong3 * dst, const __global ulong3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local ulong3 *, const __global ulong3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local long3 *, const __global long3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I8(__local ulong3 *dst, const __global ulong3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local ulong3 *, const __global ulong3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local long3 *, const __global long3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I8(__global ulong3 * dst, const __local ulong3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global ulong3 *, const __local ulong3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global long3 *, const __local long3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I8(__global ulong3 *dst, const __local ulong3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global ulong3 *, const __local ulong3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global long3 *, const __local long3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float3 * dst, const __global float3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float3 * dst, const __local float3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double3 * dst, const __global double3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double3 * dst, const __local double3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double3 *p, size_t n)
+{
+    // nothing to do
+}
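+
+/*
+ * Strided-copy usage sketch (hypothetical, not part of this library):
+ * gathering one column of a row-major matrix with `stride` float3 elements
+ * per row into local memory.  As implemented above, the stride scales the
+ * global-side index:
+ *
+ *   __local float3 col[GROUP_SIZE];
+ *   event_t ev = async_work_group_strided_copy(col, matrix + column,
+ *                                              GROUP_SIZE, stride, 0);
+ *   wait_group_events(1, &ev);
+ */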
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I1(__local uchar4 * dst, const __global uchar4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local uchar4 *, const __global uchar4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local char4 *, const __global char4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I1(__local uchar4 *dst, const __global uchar4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t async_work_group_strided_copy(__local uchar4 *, const __global uchar4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t async_work_group_strided_copy(__local char4 *, const __global char4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I1(__global uchar4 * dst, const __local uchar4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global uchar4 *, const __local uchar4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global char4 *, const __local char4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I1(__global uchar4 *dst, const __local uchar4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global uchar4 *, const __local uchar4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global char4 *, const __local char4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I2(__local ushort4 * dst, const __global ushort4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local ushort4 *, const __global ushort4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local short4 *, const __global short4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I2(__local ushort4 *dst, const __global ushort4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local ushort4 *, const __global ushort4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local short4 *, const __global short4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I2(__global ushort4 * dst, const __local ushort4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global ushort4 *, const __local ushort4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global short4 *, const __local short4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I2(__global ushort4 *dst, const __local ushort4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global ushort4 *, const __local ushort4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global short4 *, const __local short4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I4(__local uint4 * dst, const __global uint4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local uint4 *, const __global uint4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local int4 *, const __global int4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I4(__local uint4 *dst, const __global uint4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local uint4 *, const __global uint4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local int4 *, const __global int4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I4(__global uint4 * dst, const __local uint4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global uint4 *, const __local uint4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global int4 *, const __local int4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I4(__global uint4 *dst, const __local uint4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t async_work_group_strided_copy(__global uint4 *, const __local uint4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t async_work_group_strided_copy(__global int4 *, const __local int4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I8(__local ulong4 * dst, const __global ulong4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local ulong4 *, const __global ulong4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local long4 *, const __global long4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I8(__local ulong4 *dst, const __global ulong4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local ulong4 *, const __global ulong4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local long4 *, const __global long4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I8(__global ulong4 * dst, const __local ulong4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global ulong4 *, const __local ulong4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global long4 *, const __local long4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I8(__global ulong4 *dst, const __local ulong4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global ulong4 *, const __local ulong4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global long4 *, const __local long4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float4 * dst, const __global float4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float4 * dst, const __local float4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double4 * dst, const __global double4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double4 * dst, const __local double4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I1(__local uchar8 * dst, const __global uchar8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local uchar8 *, const __global uchar8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local char8 *, const __global char8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I1(__local uchar8 *dst, const __global uchar8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local uchar8 *, const __global uchar8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local char8 *, const __global char8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I1(__global uchar8 * dst, const __local uchar8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global uchar8 *, const __local uchar8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global char8 *, const __local char8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I1(__global uchar8 *dst, const __local uchar8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t async_work_group_strided_copy(__global uchar8 *, const __local uchar8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t async_work_group_strided_copy(__global char8 *, const __local char8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I2(__local ushort8 * dst, const __global ushort8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local ushort8 *, const __global ushort8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local short8 *, const __global short8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I2(__local ushort8 *dst, const __global ushort8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local ushort8 *, const __global ushort8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local short8 *, const __global short8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I2(__global ushort8 * dst, const __local ushort8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global ushort8 *, const __local ushort8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global short8 *, const __local short8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I2(__global ushort8 *dst, const __local ushort8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global ushort8 *, const __local ushort8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global short8 *, const __local short8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I4(__local uint8 * dst, const __global uint8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I4"))) event_t async_work_group_copy(__local uint8 *, const __global uint8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I4"))) event_t async_work_group_copy(__local int8 *, const __global int8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I4(__local uint8 *dst, const __global uint8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local uint8 *, const __global uint8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local int8 *, const __global int8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I4(__global uint8 * dst, const __local uint8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global uint8 *, const __local uint8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global int8 *, const __local int8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I4(__global uint8 *dst, const __local uint8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global uint8 *, const __local uint8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global int8 *, const __local int8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I8(__local ulong8 * dst, const __global ulong8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local ulong8 *, const __global ulong8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local long8 *, const __global long8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I8(__local ulong8 *dst, const __global ulong8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local ulong8 *, const __global ulong8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local long8 *, const __global long8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I8(__global ulong8 * dst, const __local ulong8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global ulong8 *, const __local ulong8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global long8 *, const __local long8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I8(__global ulong8 *dst, const __local ulong8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global ulong8 *, const __local ulong8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global long8 *, const __local long8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float8 * dst, const __global float8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float8 * dst, const __local float8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double8 * dst, const __global double8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double8 * dst, const __local double8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I1(__local uchar16 * dst, const __global uchar16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t async_work_group_copy(__local uchar16 *, const __global uchar16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t async_work_group_copy(__local char16 *, const __global char16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I1(__local uchar16 *dst, const __global uchar16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local uchar16 *, const __global uchar16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local char16 *, const __global char16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I1(__global uchar16 * dst, const __local uchar16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global uchar16 *, const __local uchar16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global char16 *, const __local char16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I1(__global uchar16 *dst, const __local uchar16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global uchar16 *, const __local uchar16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global char16 *, const __local char16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I2(__local ushort16 * dst, const __global ushort16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local ushort16 *, const __global ushort16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local short16 *, const __global short16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I2(__local ushort16 *dst, const __global ushort16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local ushort16 *, const __global ushort16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local short16 *, const __global short16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I2(__global ushort16 * dst, const __local ushort16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global ushort16 *, const __local ushort16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global short16 *, const __local short16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I2(__global ushort16 *dst, const __local ushort16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global ushort16 *, const __local ushort16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global short16 *, const __local short16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I4(__local uint16 * dst, const __global uint16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local uint16 *, const __global uint16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local int16 *, const __global int16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I4(__local uint16 *dst, const __global uint16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local uint16 *, const __global uint16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local int16 *, const __global int16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I4(__global uint16 * dst, const __local uint16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global uint16 *, const __local uint16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global int16 *, const __local int16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I4(__global uint16 *dst, const __local uint16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global uint16 *, const __local uint16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global int16 *, const __local int16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I8(__local ulong16 * dst, const __global ulong16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local ulong16 *, const __global ulong16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local long16 *, const __global long16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I8(__local ulong16 *dst, const __global ulong16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local ulong16 *, const __global ulong16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local long16 *, const __global long16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I8(__global ulong16 * dst, const __local ulong16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global ulong16 *, const __local ulong16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global long16 *, const __local long16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I8(__global ulong16 *dst, const __local ulong16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) event_t async_work_group_strided_copy(__global ulong16 *, const __local ulong16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) event_t async_work_group_strided_copy(__global long16 *, const __local long16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float16 * dst, const __global float16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float16 * dst, const __local float16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double16 * dst, const __global double16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double16 * dst, const __local double16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double16 *p, size_t n)
+{
+    // nothing to do
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+wait_group_events(int num_events, event_t *event_list)
+{
+    // Nothing to do
+}
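+
+// Usage sketch (illustrative only; the kernel and its arguments are
+// hypothetical, not part of this library): staging a tile through local
+// memory with the copy builtins defined above.
+//
+//   __kernel void scale_tiles(__global float8 *in, __global float8 *out,
+//                             __local float8 *tile, uint tile_elems)
+//   {
+//       event_t ev = async_work_group_copy(tile, in + get_group_id(0) * tile_elems,
+//                                          tile_elems, 0);
+//       wait_group_events(1, &ev);
+//       for (size_t i = get_local_id(0); i < tile_elems; i += get_local_size(0))
+//           tile[i] *= 2.0f;
+//       barrier(CLK_LOCAL_MEM_FENCE);
+//       ev = async_work_group_copy(out + get_group_id(0) * tile_elems, tile,
+//                                  tile_elems, 0);
+//       wait_group_events(1, &ev);
+//   }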
diff --git a/amd-builtins/misc/bitsel.cl b/amd-builtins/misc/bitsel.cl
new file mode 100644
index 0000000..2a12ffc
--- /dev/null
+++ b/amd-builtins/misc/bitsel.cl
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((pure)) uint __amdil_bfi_u32(uint, uint, uint);
+
+// [u]int
+
+__attribute__((always_inline)) static uint
+__BSELI4(uint a, uint b, uint c)
+{
+    return __amdil_bfi_u32(c, b, a);
+}
+
+extern __attribute__((overloadable, alias("__BSELI4"))) uint bitselect(uint, uint, uint);
+extern __attribute__((overloadable, alias("__BSELI4"))) int bitselect(int, int, int);
+
+// float
+
+__attribute__((overloadable, always_inline)) float
+bitselect(float a, float b, float c)
+{
+    return as_float(__amdil_bfi_u32(as_uint(c), as_uint(b), as_uint(a)));
+}
+
+// [u]long
+
+// No __amdil equivalent, so use __hsail intrinsic here
+extern __attribute__((const)) ulong __hsail_bitselect_u64(ulong, ulong, ulong);
+
+__attribute__((always_inline)) static ulong
+__BSELI8(ulong a, ulong b, ulong c)
+{
+    return __hsail_bitselect_u64(c, b, a);
+}
+
+extern __attribute__((overloadable, alias("__BSELI8"))) ulong bitselect(ulong, ulong, ulong);
+extern __attribute__((overloadable, alias("__BSELI8"))) long bitselect(long, long, long);
+
+// double
+
+__attribute__((overloadable, always_inline)) double
+bitselect(double a, double b, double c)
+{
+    return as_double(__hsail_bitselect_u64(as_ulong(c), as_ulong(b), as_ulong(a)));
+}
+
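+// Reference semantics (for checking the intrinsic mappings above):
+// bitselect(a, b, c) == (a & ~c) | (b & c) -- each result bit comes from b
+// where c has a 1 and from a where c has a 0. A plain OpenCL C sketch:
+//
+//   static uint bitselect_ref(uint a, uint b, uint c)
+//   {
+//       return (a & ~c) | (b & c);
+//   }
+//
+// e.g. bitselect(0x00000000u, 0xffffffffu, 0x0000ff00u) == 0x0000ff00u.
+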
diff --git a/amd-builtins/misc/class.cl b/amd-builtins/misc/class.cl
new file mode 100644
index 0000000..17f593e
--- /dev/null
+++ b/amd-builtins/misc/class.cl
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+extern __attribute__((pure)) int __amdil_class_f32(float, int);
+extern __attribute__((pure)) int __amdil_class_f64(double, int);
+
+#define FC(F,M) \
+__attribute__((overloadable, always_inline)) int \
+F(float x) \
+{ \
+    return __amdil_class_f32(x, M) & 1; \
+} \
+__attribute__((overloadable, always_inline)) int2 \
+F(float2 x) \
+{ \
+    int2 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int3 \
+F(float3 x) \
+{ \
+    int3 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int4 \
+F(float4 x) \
+{ \
+    int4 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    ret.s3 = __amdil_class_f32(x.s3, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int8 \
+F(float8 x) \
+{ \
+    int8 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    ret.s3 = __amdil_class_f32(x.s3, M); \
+    ret.s4 = __amdil_class_f32(x.s4, M); \
+    ret.s5 = __amdil_class_f32(x.s5, M); \
+    ret.s6 = __amdil_class_f32(x.s6, M); \
+    ret.s7 = __amdil_class_f32(x.s7, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int16 \
+F(float16 x) \
+{ \
+    int16 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    ret.s3 = __amdil_class_f32(x.s3, M); \
+    ret.s4 = __amdil_class_f32(x.s4, M); \
+    ret.s5 = __amdil_class_f32(x.s5, M); \
+    ret.s6 = __amdil_class_f32(x.s6, M); \
+    ret.s7 = __amdil_class_f32(x.s7, M); \
+    ret.s8 = __amdil_class_f32(x.s8, M); \
+    ret.s9 = __amdil_class_f32(x.s9, M); \
+    ret.sa = __amdil_class_f32(x.sa, M); \
+    ret.sb = __amdil_class_f32(x.sb, M); \
+    ret.sc = __amdil_class_f32(x.sc, M); \
+    ret.sd = __amdil_class_f32(x.sd, M); \
+    ret.se = __amdil_class_f32(x.se, M); \
+    ret.sf = __amdil_class_f32(x.sf, M); \
+    return ret; \
+}
+
+
+#define DC(F,M) \
+__attribute__((overloadable, always_inline)) int \
+F(double x) \
+{ \
+    return __amdil_class_f64(x, M) & 1; \
+} \
+__attribute__((overloadable, always_inline)) long2 \
+F(double2 x) \
+{ \
+    long2 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long3 \
+F(double3 x) \
+{ \
+    long3 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long4 \
+F(double4 x) \
+{ \
+    long4 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    ret.s3 = __amdil_class_f64(x.s3, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long8 \
+F(double8 x) \
+{ \
+    long8 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    ret.s3 = __amdil_class_f64(x.s3, M); \
+    ret.s4 = __amdil_class_f64(x.s4, M); \
+    ret.s5 = __amdil_class_f64(x.s5, M); \
+    ret.s6 = __amdil_class_f64(x.s6, M); \
+    ret.s7 = __amdil_class_f64(x.s7, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long16 \
+F(double16 x) \
+{ \
+    long16 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    ret.s3 = __amdil_class_f64(x.s3, M); \
+    ret.s4 = __amdil_class_f64(x.s4, M); \
+    ret.s5 = __amdil_class_f64(x.s5, M); \
+    ret.s6 = __amdil_class_f64(x.s6, M); \
+    ret.s7 = __amdil_class_f64(x.s7, M); \
+    ret.s8 = __amdil_class_f64(x.s8, M); \
+    ret.s9 = __amdil_class_f64(x.s9, M); \
+    ret.sa = __amdil_class_f64(x.sa, M); \
+    ret.sb = __amdil_class_f64(x.sb, M); \
+    ret.sc = __amdil_class_f64(x.sc, M); \
+    ret.sd = __amdil_class_f64(x.sd, M); \
+    ret.se = __amdil_class_f64(x.se, M); \
+    ret.sf = __amdil_class_f64(x.sf, M); \
+    return ret; \
+}
+
+FC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR))
+FC(isinf, (NINF|PINF))
+FC(isnan, (SNAN|QNAN))
+FC(isnormal, (NNOR|PNOR))
+
+DC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR))
+DC(isinf, (NINF|PINF))
+DC(isnan, (SNAN|QNAN))
+DC(isnormal, (NNOR|PNOR))
+
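+// Behavior sketch: isnan tests the SNAN|QNAN classes, so isnan(NAN) == 1
+// and isnan(1.0f) == 0 for scalars. Per the OpenCL spec the vector forms
+// return -1 (all bits set) in each true lane, e.g.
+//
+//   int2 r = isnan((float2)(NAN, 1.0f));   // expected r == (int2)(-1, 0)
+//
+// which assumes __amdil_class_f32 itself yields all-ones on a class match
+// (the scalar forms mask with "& 1" to produce 0 or 1).
+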
diff --git a/amd-builtins/misc/counter.cl b/amd-builtins/misc/counter.cl
new file mode 100644
index 0000000..8aef73b
--- /dev/null
+++ b/amd-builtins/misc/counter.cl
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifdef USE_COUNTER
+
+#pragma OPENCL EXTENSION cl_amd_atomic_counters32 : enable
+
+extern uint __amdil_append_alloc_i32(counter32_t);
+extern uint __amdil_append_consume_i32(counter32_t);
+
+__attribute__((overloadable, always_inline)) uint
+atomic_inc(counter32_t p)
+{
+    return __amdil_append_alloc_i32(p);
+}
+
+__attribute__((overloadable, always_inline)) uint
+atomic_dec(counter32_t p)
+{
+    // append_consume returns the post-decrement value, but OpenCL's
+    // atomic_dec must return the value prior to the decrement, hence +1.
+    return __amdil_append_consume_i32(p) + 1U;
+}
+
+#endif
+
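+// Usage sketch (hypothetical kernel; requires cl_amd_atomic_counters32):
+//
+//   __kernel void compact(counter32_t ctr, __global const int *in,
+//                         __global int *out)
+//   {
+//       int v = in[get_global_id(0)];
+//       if (v != 0)
+//           out[atomic_inc(ctr)] = v;   // append-style allocation
+//   }
+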
diff --git a/amd-builtins/misc/floattointconversion.h b/amd-builtins/misc/floattointconversion.h
new file mode 100644
index 0000000..dc1f7f2
--- /dev/null
+++ b/amd-builtins/misc/floattointconversion.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+static inline double float_uint_to_double(uint x)
+{
+    double d;
+    float f = as_float(x);
+
+    // Fix up subnormals, if necessary: scale the mantissa into [1,2),
+    // subtract 1.0 to recover it exactly, then rebias the double's
+    // exponent down by 126 (0x07e << 52).
+    uint fmant = x & 0x007fffff;
+    float temp = as_float(fmant | 0x3f800000);
+    temp -= 1.0f;
+    d = (double)temp;
+    ulong ld = as_ulong(d);
+    ld -= 0x07e0000000000000;
+    d = as_double(ld);
+    d = fmant ? d : 0.0;
+    d = x & 0x80000000 ? -d : d;
+    // Normal (nonzero) inputs convert directly.
+    d = (f != 0.0f) ? (double)f : d;
+
+    return d;
+}
+
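+// Worked example for the subnormal path above: x = 0x00000001 (2^-149).
+// fmant = 1, so temp = as_float(0x3f800001) - 1.0f = 2^-23 exactly.
+// As a double, 2^-23 is 0x3E80000000000000; subtracting 0x07e0000000000000
+// lowers the exponent by 126, yielding 0x36A0000000000000 = 2^-149.
+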
+static inline uint double_to_float_uint(double d)
+{
+    uint dlow, dhigh, dsign;
+    float f = (float)d;
+    uint uf;
+
+    double dabs = (d < 0.) ? -d : d;
+
+    // Fix up subnormal results: extract the double's significand and
+    // shift it into the float subnormal position.
+    ulong ld;
+    ld = as_ulong(d);
+    dlow = ld;
+    dhigh = ld >> 32;
+    dsign = dhigh & 0x80000000;
+
+    int dexp = (dhigh >> 20) & 0x7ff;
+    int shiftcount = 0x381 - dexp;
+    dhigh &= 0x000fffff;
+    dhigh |= 0x00100000;
+    dhigh = (dhigh << 3) | (dlow >> 29);
+    dlow <<= 3;
+    uint extrabits = dlow << (32 - shiftcount);
+    dlow = (dlow >> shiftcount) | (dhigh << (32 - shiftcount));
+    dhigh >>= shiftcount;
+    // Round to nearest, ties to even.
+    dhigh = ((dlow > 0x80000000u) ||
+             ((dlow == 0x80000000u) && ((dhigh & 1) | extrabits))) ?
+        dhigh + 1 : dhigh;
+    uf = dhigh | dsign;
+    // Inputs below 2^-150 (half the smallest float subnormal) flush to zero.
+    uf = dabs >= 7.0064923216240869000000e-046 ? uf : 0;
+
+    // Results representable as a nonzero float convert directly.
+    uf = f != 0.0f ? as_uint(f) : uf;
+    return uf;
+}
\ No newline at end of file
diff --git a/amd-builtins/misc/minmax.cl b/amd-builtins/misc/minmax.cl
new file mode 100644
index 0000000..eaf6ef1
--- /dev/null
+++ b/amd-builtins/misc/minmax.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//#define G(F,T,N) \
+//__attribute__((overloadable, always_inline)) T##N \
+//F(T##N x, T##N y) \
+//{ \
+//    T##N ret; \
+//    ret.lo = F(x.lo, y.lo); \
+//    ret.hi = F(x.hi, y.hi); \
+//    return ret; \
+//}
+//
+//G(min,float,16)
+//G(min,float,8)
+
+//__attribute__((overloadable, always_inline)) float4
+//min(float4 x, float4 y)
+//{
+//    return __amdil_min_v4f32(x, y);
+//}
+//
+//__attribute__((overloadable, always_inline)) float3
+//min(float3 x, float3 y)
+//{
+//#if defined VEC3_BACKEND
+//    return __amdil_min_v3f32(x, y);
+//#else
+//    float3 ret;
+//    ret.xy = min(x.xy, y.xy);
+//    ret.z = min(x.z, y.z);
+//    return ret;
+//#endif
+//}
+//
+//__attribute__((overloadable, always_inline)) float2
+//min(float2 x, float2 y)
+//{
+//    return __amdil_min_v2f32(x, y);
+//}
+
+extern __attribute__((pure)) float __hsail_min_f32(float,float);
+
+__attribute__((weak, overloadable, always_inline)) float
+min(float x, float y)
+{
+    return __hsail_min_f32(x, y);
+}
+
+//G(min,double,16)
+//G(min,double,8)
+//G(min,double,4)
+//G(min,double,3)
+//G(min,double,2)
+
+extern __attribute__((pure)) double __hsail_min_f64(double,double);
+
+__attribute__((weak, overloadable, always_inline)) double
+min(double x, double y)
+{
+    return __hsail_min_f64(x, y);
+}
+
+//G(max,float,16)
+//G(max,float,8)
+//
+//__attribute__((overloadable, always_inline)) float4
+//max(float4 x, float4 y)
+//{
+//    return __amdil_max_v4f32(x, y);
+//}
+//
+//__attribute__((overloadable, always_inline)) float3
+//max(float3 x, float3 y)
+//{
+//#if defined VEC3_BACKEND
+//    return __amdil_max_v3f32(x, y);
+//#else
+//    float3 ret;
+//    ret.xy = max(x.xy, y.xy);
+//    ret.z = max(x.z, y.z);
+//    return ret;
+//#endif
+//}
+//
+//__attribute__((overloadable, always_inline)) float2
+//max(float2 x, float2 y)
+//{
+//    return __amdil_max_v2f32(x, y);
+//}
+
+extern __attribute__((pure)) float __hsail_max_f32(float,float);
+
+__attribute__((weak, overloadable, always_inline)) float
+max(float x, float y)
+{
+    return __hsail_max_f32(x, y);
+}
+
+//G(max,double,16)
+//G(max,double,8)
+//G(max,double,4)
+//G(max,double,3)
+//G(max,double,2)
+
+extern __attribute__((pure)) double __hsail_max_f64(double,double);
+
+__attribute__((weak, overloadable, always_inline)) double
+max(double x, double y)
+{
+    return __hsail_max_f64(x, y);
+}
diff --git a/amd-builtins/misc/printf_alloc.cl b/amd-builtins/misc/printf_alloc.cl
new file mode 100644
index 0000000..7a1f277
--- /dev/null
+++ b/amd-builtins/misc/printf_alloc.cl
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __OPENCL_C_VERSION__ >= 200
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+extern __attribute__((const)) uint  __hsail_ld_kernarg_u32(uint);
+extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint);
+
+#define OFFSET 8
+
+__global char* __printf_alloc(unsigned int bytes)
+{
+  // Functionality:
+  // The printf buffer pointer is loaded from the kernarg segment
+  // (byte offset 12 with 32-bit size_t, 24 with 64-bit).  The first
+  // OFFSET (8) bytes of the buffer hold control data and are skipped
+  // when returning storage to the caller.
+  // buffer[0] holds the current write offset into the buffer; it is
+  // advanced by the requested byte count with an atomic
+  // compare-and-exchange loop.
+  // buffer[4] holds the size of the buffer.  If the request would run
+  // past buffer[0] + size -- the buffer overflow condition -- NULL is
+  // returned.
+  // The buffer size is hard limited by sizeof(uint).
+  //
+  __global char* ptr;
+  if (sizeof(size_t) == 4)
+    ptr = (__global char*) __hsail_ld_kernarg_u32(12);
+  else
+    ptr = (__global char*) __hsail_ld_kernarg_u64(24);
+  uint size = ((global uint *)ptr)[1];
+  uint offset = atomic_load_explicit((__global atomic_uint *)ptr,
+                                     memory_order_acquire, memory_scope_device);
+  for (;;) {
+    if (OFFSET + offset + bytes > size)
+      return NULL;
+    if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr,
+        &offset, offset+bytes, memory_order_acq_rel, memory_order_acquire,
+        memory_scope_device))
+      break;
+  }
+  return ptr + OFFSET + offset;
+}
+#endif
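+
+// Usage sketch (hypothetical caller; names are illustrative): a lowered
+// printf would reserve its record and then fill it in, e.g.
+//
+//   __global char *rec = __printf_alloc(len);
+//   if (rec != NULL) {
+//       // write the format-string id and packed arguments into rec[0..len)
+//   }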
diff --git a/amd-builtins/misc/relationals.cl b/amd-builtins/misc/relationals.cl
new file mode 100644
index 0000000..b220128
--- /dev/null
+++ b/amd-builtins/misc/relationals.cl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Vector expansions for HSAIL relationals
+
+#define UnaryRelationalVector(oty, ity, fun, mgl) \
+__attribute__((weak,always_inline)) \
+oty##16 __##fun##_16##mgl(ity##16 a) \
+{ \
+	oty##16 c; \
+	c.lo = fun(a.lo); \
+	c.hi = fun(a.hi); \
+	return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##8 __##fun##_8##mgl(ity##8 a) \
+{ \
+	oty##8 c; \
+	c.lo = fun(a.lo); \
+	c.hi = fun(a.hi); \
+	return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##4 __##fun##_4##mgl(ity##4 a) \
+{ \
+	oty##4 c; \
+	c.lo = fun(a.lo); \
+	c.hi = fun(a.hi); \
+	return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##3 __##fun##_3##mgl(ity##3 a) \
+{ \
+	oty##3 c; \
+	c.xy = fun(a.xy); \
+	c.z = fun(a.z); \
+	return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##2 __##fun##_2##mgl(ity##2 a) \
+{ \
+	oty##2 c; \
+	c.lo = fun(a.lo); \
+	c.hi = fun(a.hi); \
+	return c; \
+}
+
+UnaryRelationalVector(int, float, isfinite, f32)
+UnaryRelationalVector(long, double, isfinite, f64)
+
+UnaryRelationalVector(int, float, isinf, f32)
+UnaryRelationalVector(long, double, isinf, f64)
+
+UnaryRelationalVector(int, float, isnan, f32)
+UnaryRelationalVector(long, double, isnan, f64)
+
+UnaryRelationalVector(int, float, isnormal, f32)
+UnaryRelationalVector(long, double, isnormal, f64)
+
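+// Expansion sketch: UnaryRelationalVector(int, float, isnan, f32) defines
+// __isnan_16f32 ... __isnan_2f32, each splitting the vector into lo/hi
+// halves (xy/z for 3-vectors) and recursing down to the scalar builtin:
+//
+//   int4 __isnan_4f32(float4 a)
+//   {
+//       int4 c;
+//       c.lo = isnan(a.lo);
+//       c.hi = isnan(a.hi);
+//       return c;
+//   }
+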
diff --git a/amd-builtins/misc/synchronization.cl b/amd-builtins/misc/synchronization.cl
new file mode 100644
index 0000000..2e29c50
--- /dev/null
+++ b/amd-builtins/misc/synchronization.cl
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern void __hsail_memfence();
+extern void __hsail_memfence_global();
+extern void __hsail_memfence_group();
+extern void __hsail_barrier();
+
+// Map OpenCL fence flags onto HSAIL fences; any combination of flags
+// falls through to the full memory fence.
+void mem_fence_impl(uint val) {
+  if (val == CLK_GLOBAL_MEM_FENCE) {
+    __hsail_memfence_global();
+  } else if (val == CLK_LOCAL_MEM_FENCE) {
+    __hsail_memfence_group();
+  } else {
+    __hsail_memfence();
+  }
+}
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void mem_fence(uint val) {
+  mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void read_mem_fence(uint val) {
+  mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void write_mem_fence(uint val) {
+  mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline))
+void barrier(uint flags) {
+  __hsail_barrier();
+}
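+
+// Usage sketch (illustrative): a work-group tree reduction over a __local
+// float *scratch. Note that this implementation synchronizes identically
+// whatever flags are passed, since __hsail_barrier takes none.
+//
+//   uint lid = get_local_id(0);
+//   for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
+//       if (lid < s)
+//           scratch[lid] += scratch[lid + s];
+//       barrier(CLK_LOCAL_MEM_FENCE);
+//   }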
diff --git a/amd-builtins/misc/workitem.cl b/amd-builtins/misc/workitem.cl
new file mode 100644
index 0000000..01244a0
--- /dev/null
+++ b/amd-builtins/misc/workitem.cl
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((const)) uint __hsail_get_global_size(uint);
+extern __attribute__((const)) uint __hsail_get_global_id(uint);
+extern __attribute__((const)) uint __hsail_workgroup_size(uint);
+extern __attribute__((const)) uint __hsail_currentworkgroup_size(uint);
+extern __attribute__((const)) uint __hsail_get_local_id(uint);
+extern __attribute__((const)) uint __hsail_get_num_groups(uint);
+extern __attribute__((const)) uint __hsail_get_group_id(uint);
+extern __attribute__((const)) uint   __hsail_get_work_dim(void);
+extern __attribute__((const)) uint  __hsail_ld_kernarg_u32(uint);
+extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint);
+extern __attribute__((pure)) uint __hsail_workitemid_flat(void);
+
+// FIXME - this will change to ulong soon
+extern __attribute__((pure)) uint __hsail_workitemid_flatabs(void);
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_offset(uint d) {
+  if (sizeof(size_t) == 4) { // 32 bit
+    switch(d) {
+      default:
+        return 0;
+      case 0:
+        return __hsail_ld_kernarg_u32(0);
+      case 1:
+        return __hsail_ld_kernarg_u32(4);
+      case 2:
+        return __hsail_ld_kernarg_u32(8);
+    }
+  } else { // 64 bit
+    switch(d) {
+      default:
+        return 0;
+      case 0:
+        return __hsail_ld_kernarg_u64(0);
+      case 1:
+        return __hsail_ld_kernarg_u64(8);
+      case 2:
+        return __hsail_ld_kernarg_u64(16);
+    }
+  }
+}
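+
+// Layout note: the offsets above assume the dispatch ABI places the three
+// global offsets at the start of the kernarg segment -- bytes 0/4/8 with
+// 32-bit size_t, bytes 0/8/16 with 64-bit size_t. __printf_alloc reads its
+// buffer pointer immediately after this block (offset 12 or 24).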
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_id(uint d) {
+  size_t id;
+  size_t o = get_global_offset(d);
+  switch(d) {
+    default:
+      id = 0;
+      break;
+    case 0:
+      id = __hsail_get_global_id(0);
+      break;
+    case 1:
+      id = __hsail_get_global_id(1);
+      break;
+    case 2:
+      id = __hsail_get_global_id(2);
+      break;
+  }
+
+  return o + id;
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_id(uint d) {
+  switch(d) {
+    default:
+      return 0;
+    case 0:
+      return __hsail_get_local_id(0);
+    case 1:
+      return __hsail_get_local_id(1);
+    case 2:
+      return __hsail_get_local_id(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_group_id(uint d) {
+  switch(d) {
+    default:
+      return 0;
+    case 0:
+      return __hsail_get_group_id(0);
+    case 1:
+      return __hsail_get_group_id(1);
+    case 2:
+      return __hsail_get_group_id(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_size(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_get_global_size(0);
+    case 1:
+      return __hsail_get_global_size(1);
+    case 2:
+      return __hsail_get_global_size(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_size(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_currentworkgroup_size(0);
+    case 1:
+      return __hsail_currentworkgroup_size(1);
+    case 2:
+      return __hsail_currentworkgroup_size(2);
+  }
+}
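+
+// Note: get_local_size reports the *current* work-group via
+// __hsail_currentworkgroup_size, which may be a partial group, whereas
+// get_enqueued_local_size (OpenCL 2.0, below) reports the size as enqueued
+// via __hsail_workgroup_size; they differ only for non-uniform work-groups.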
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_num_groups(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_get_num_groups(0);
+    case 1:
+      return __hsail_get_num_groups(1);
+    case 2:
+      return __hsail_get_num_groups(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+uint get_work_dim() {
+  return __hsail_get_work_dim();
+}
+
+#if __OPENCL_C_VERSION__ >= 200
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_enqueued_local_size(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_workgroup_size(0);
+    case 1:
+      return __hsail_workgroup_size(1);
+    case 2:
+      return __hsail_workgroup_size(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_linear_id(void) {
+#if defined NO_WORKITEM_FLATABS
+    return (__hsail_get_global_id(2) * __hsail_get_global_size(1) +
+            __hsail_get_global_id(1)) * __hsail_get_global_size(0) +
+	    __hsail_get_global_id(0);
+#else
+    return __hsail_workitemid_flatabs();
+#endif
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_linear_id(void) {
+    return __hsail_workitemid_flat();
+}
+
+#endif
+
diff --git a/amd-builtins/pipes/commitp.cl b/amd-builtins/pipes/commitp.cl
new file mode 100644
index 0000000..9068c9b
--- /dev/null
+++ b/amd-builtins/pipes/commitp.cl
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
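+// Note: every commit entry point in this file is intentionally empty;
+// presumably (assumption -- the reserve side is not in this file) reserving
+// packets already publishes the indices, leaving commit nothing to do.
+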
+#define __COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+// Work group functions
+
+#define __WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__work_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__work_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__work_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__work_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+// sub group functions
+
+#define __SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__sub_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__sub_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__sub_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__sub_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
diff --git a/amd-builtins/pipes/getp.cl b/amd-builtins/pipes/getp.cl
new file mode 100644
index 0000000..896a9f5
--- /dev/null
+++ b/amd-builtins/pipes/getp.cl
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __GET_PIPE_NUM_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) uint \
+__get_pipe_num_packets_internal_##SIZE(__global struct pipeimp* p) \
+{ \
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    return (uint)(wi - ri); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__GET_PIPE_NUM_PACKETS_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) uint
+__get_pipe_num_packets_internal_user(__global struct pipeimp* p, size_t size)
+{
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    return (uint)(wi - ri);
+}
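+
+// Since both indices only grow between resets, the unsigned difference
+// wi - ri is the number of packets currently in flight; e.g. with wi == 7
+// and ri == 3 the pipe holds 4 unread packets.  The result is a snapshot
+// only: other work-items may change it immediately afterwards.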
+
+#define __GET_PIPE_MAX_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) uint \
+__get_pipe_max_packets_internal_##SIZE(__global struct pipeimp* p) \
+{ \
+    return (uint)p->end_idx; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__GET_PIPE_MAX_PACKETS_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) uint
+__get_pipe_max_packets_internal_user(__global struct pipeimp* p, size_t size)
+{
+    return (uint)p->end_idx;
+}
+
diff --git a/amd-builtins/pipes/memcpyia.cl b/amd-builtins/pipes/memcpyia.cl
new file mode 100644
index 0000000..9f57046
--- /dev/null
+++ b/amd-builtins/pipes/memcpyia.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
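+// Copies size bytes from s to d in units of align bytes.  For align > 1
+// the loops copy size/align whole elements, so size is assumed to be a
+// multiple of align; trailing bytes would be dropped otherwise.  A minimal
+// usage sketch (hypothetical 16-byte, 16-aligned packet):
+//
+//   uchar16 pkt;
+//   __memcpy_internal_aligned(&pkt, src, sizeof(uchar16), 16);
+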
+__attribute__((always_inline, weak)) void 
+__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align)
+{
+    if (align == 2) {
+        short *d2 = (short *)d;
+        const short *s2 = (const short *)s;
+        const short *e2 = s2 + size/2;
+
+        while (s2 < e2)
+            *d2++ = *s2++;
+    } else if (align == 4) {
+        int *d4 = (int *)d;
+        const int *s4 = (const int *)s;
+        const int *e4 = s4 + size/4;
+
+        while (s4 < e4)
+            *d4++ = *s4++;
+    } else if (align == 8) {
+        long *d8 = (long *)d;
+        const long *s8 = (const long *)s;
+        const long *e8 = s8 + size/8;
+
+        while (s8 < e8)
+            *d8++ = *s8++;
+    } else if (align == 16) {
+        long2 *d16 = (long2 *)d;
+        const long2 *s16 = (const long2 *)s;
+        const long2 *e16 = s16 + size/16;
+
+        while (s16 < e16)
+            *d16++ = *s16++;
+    } else if (align == 32 || align == 64 || align == 128) {
+        long4 *d32 = (long4 *)d;
+        const long4 *s32 = (const long4 *)s;
+        const long4 *e32 = s32 + size/32;
+
+        while (s32 < e32)
+            *d32++ = *s32++;
+    } else {
+        char *d1 = (char *)d;
+        const char *s1 = (const char *)s;
+        const char *e1 = s1 + size;
+
+        while (s1 < e1)
+            *d1++ = *s1++;
+    }
+}
+
diff --git a/amd-builtins/pipes/pipes.h b/amd-builtins/pipes/pipes.h
new file mode 100644
index 0000000..7a98fc1
--- /dev/null
+++ b/amd-builtins/pipes/pipes.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef _PIPES_H
+#define _PIPES_H 1
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+ 
+#define DO_PIPE_INTERNAL_SIZE(F) \
+F(1,uchar) \
+F(2,ushort) \
+F(4,uint) \
+F(8,ulong) \
+F(16,ulong2) \
+F(32,ulong4) \
+F(64,ulong8) \
+F(128,ulong16)
+
+struct pipeimp {
+    atomic_size_t read_idx;
+    atomic_size_t write_idx;
+    size_t end_idx;
+    uchar pad[128 - 3*sizeof(size_t)];
+    uchar packets[1];
+};
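+
+// Layout note: read_idx, write_idx, end_idx plus pad fill one 128-byte
+// line, presumably to keep the hot atomic indices off the same cache line
+// as the packet data; packets[1] is a C89-style flexible array that the
+// runtime over-allocates to end_idx packets.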
+
+extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);
+
+static inline size_t
+reserve(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
+{
+    size_t idx = atomic_load_explicit(pidx, memory_order_acquire, memory_scope_device);
+
+    for (;;) {
+        if (idx + n > lim)
+            return ~(size_t)0;
+
+        if (atomic_compare_exchange_strong_explicit(pidx, &idx, idx + n, memory_order_acq_rel, memory_order_acquire, memory_scope_device))
+            break;
+    }
+
+    return idx;
+}
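+
+// reserve() claims n consecutive slots by advancing *pidx with a CAS loop,
+// returning the old index on success and ~(size_t)0 when fewer than n
+// slots remain below lim.  A sketch of the reader-side pattern used in
+// readp.cl:
+//
+//   size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed,
+//                                    memory_scope_device);
+//   size_t ri = reserve(&p->read_idx, wi, 1);
+//   if (ri != ~(size_t)0) {
+//       /* packet ri % p->end_idx now belongs to this work-item */
+//   }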
+
+#endif // _PIPES_H
+
diff --git a/amd-builtins/pipes/readp.cl b/amd-builtins/pipes/readp.cl
new file mode 100644
index 0000000..7613d3f
--- /dev/null
+++ b/amd-builtins/pipes/readp.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__read_pipe_internal_##SIZE(__global struct pipeimp* p, STYPE* ptr) \
+{ \
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ri = reserve(&p->read_idx, wi, 1); \
+    if (ri == ~(size_t)0) \
+        return -1; \
+ \
+    *ptr = ((__global STYPE *)p->packets)[ri % p->end_idx]; \
+ \
+    if (ri == wi-1) { \
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    }\
+\
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__read_pipe_internal_user( __global struct pipeimp* p, void* ptr, size_t size, size_t align)
+{
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t ri = reserve(&p->read_idx, wi, 1);
+    if (ri == ~(size_t)0)
+        return -1;
+
+    __memcpy_internal_aligned(ptr, p->packets + (ri % p->end_idx)*size, size, align);
+
+    if (ri == wi-1) {
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return 0;
+}
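+
+// When the reader consumes the last outstanding packet (ri == wi-1), both
+// indices are reset to 0.  This appears to be a wraparound-avoidance
+// measure: the monotonically growing indices are pulled back to the start
+// of the ring whenever the pipe empties, so in practice they never
+// approach the size_t limit.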
+
+#define __READ_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__read_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr)  \
+{ \
+    rid += i; \
+    *ptr = ((__global STYPE *)p->packets)[rid % p->end_idx]; \
+ \
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INDEXED_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__read_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, void *ptr, size_t size, size_t align)
+{
+    rid += i;
+
+    __memcpy_internal_aligned(ptr, p->packets + (rid % p->end_idx)*size, size, align);
+
+    return 0;
+}
+
diff --git a/amd-builtins/pipes/reservep.cl b/amd-builtins/pipes/reservep.cl
new file mode 100644
index 0000000..991041e
--- /dev/null
+++ b/amd-builtins/pipes/reservep.cl
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#include "pipes.h"
+#include "../workgroup/wg.h"
+
+#define __RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+    if (rid + num_packets == wi) { \
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    } \
+ \
+    return rid; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t rid = reserve(&p->read_idx, wi, num_packets);
+
+    if (rid + num_packets == wi) {
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return rid;
+}
+
+#define __RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    return reserve(&p->write_idx, ri + ei, num_packets); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    return reserve(&p->write_idx, ri + ei, num_packets);
+}
+
+// Work group functions
+
+#define __WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__work_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__wg_scratch; \
+ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+        if (rid + num_packets == wi) { \
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+ \
+        *t = rid; \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__work_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    __local size_t *t = (__local size_t *)__wg_scratch;
+
+    if ((int)get_local_linear_id() == 0) {
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        size_t rid = reserve(&p->read_idx, wi, num_packets);
+
+        if (rid + num_packets == wi) {
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+
+        *t = rid;
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
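+
+// The work-group variants use a single-lane pattern: work-item 0 performs
+// the reservation, stores the result in the __wg_scratch local buffer, and
+// a local barrier makes it visible before every work-item returns the same
+// reservation id.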
+
+#define __WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__work_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__wg_scratch; \
+ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        *t = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__work_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    __local size_t *t = (__local size_t *)__wg_scratch;
+
+    if ((int)get_local_linear_id() == 0) {
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        *t = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
+
+// Sub group functions
+
+#define __SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__sub_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+        if (rid + num_packets == wi) { \
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__sub_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) {
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        rid = reserve(&p->read_idx, wi, num_packets);
+
+        if (rid + num_packets == wi) {
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
+#define __SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__sub_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        rid = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__sub_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) {
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        rid = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
diff --git a/amd-builtins/pipes/validp.cl b/amd-builtins/pipes/validp.cl
new file mode 100644
index 0000000..512b7d6
--- /dev/null
+++ b/amd-builtins/pipes/validp.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+
+__attribute__((always_inline, weak)) bool
+__is_valid_reserve_id(size_t rid)
+{
+    return rid != ~(size_t)0;
+}
+
diff --git a/amd-builtins/pipes/writep.cl b/amd-builtins/pipes/writep.cl
new file mode 100644
index 0000000..22cf6fb
--- /dev/null
+++ b/amd-builtins/pipes/writep.cl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__write_pipe_internal_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \
+{ \
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    size_t wi = reserve(&p->write_idx, ri+ei, 1); \
+    if (wi == ~(size_t)0) \
+        return -1; \
+ \
+    ((__global STYPE *)p->packets)[wi % ei] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__write_pipe_internal_user(__global struct pipeimp* p, const void* ptr, size_t size, size_t align)
+{
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    size_t wi = reserve(&p->write_idx, ri+ei, 1);
+    if (wi == ~(size_t)0)
+        return -1;
+
+    __memcpy_internal_aligned(p->packets + (wi % ei)*size, ptr, size, align);
+
+    return 0;
+}
+
+#define __WRITE_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__write_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr)  \
+{ \
+    rid += i; \
+    ((__global STYPE *)p->packets)[rid % p->end_idx] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INDEXED_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__write_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, size_t size, size_t align)
+{
+    rid += i;
+
+    __memcpy_internal_aligned(p->packets + (rid % p->end_idx)*size, ptr, size, align);
+
+    return 0;
+}
+
diff --git a/amd-builtins/subgroup/subany.cl b/amd-builtins/subgroup/subany.cl
new file mode 100644
index 0000000..5b76355
--- /dev/null
+++ b/amd-builtins/subgroup/subany.cl
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern __attribute__((pure)) uint __hsail_activelanecount_wavewidth_u32_b1(bool);
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+sub_group_all(int predicate)
+{
+    return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) == __hsail_activelanecount_wavewidth_u32_b1(true);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+sub_group_any(int predicate)
+{
+    return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) != 0;
+}
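+
+// In other words, sub_group_all holds when the number of active lanes with
+// a true predicate equals the total number of active lanes, and
+// sub_group_any holds when that count is nonzero.  For example, with
+// predicate = (get_sub_group_local_id() < 8) on a full 64-lane wave,
+// sub_group_any(predicate) == 1 while sub_group_all(predicate) == 0.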
+
diff --git a/amd-builtins/subgroup/subbar.cl b/amd-builtins/subgroup/subbar.cl
new file mode 100644
index 0000000..9424af3
--- /dev/null
+++ b/amd-builtins/subgroup/subbar.cl
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern void __hsail_wavebarrier(void);
+
+__attribute__((overloadable,weak,always_inline)) void
+sub_group_barrier(cl_mem_fence_flags flags)
+{
+    sub_group_barrier(flags, memory_scope_sub_group);
+}
+
+__attribute__((overloadable,weak,always_inline)) void
+sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
+{
+    // What about CLK_IMAGE_MEM_FENCE?
+    atomic_work_item_fence(flags, memory_order_release, scope);
+    __hsail_wavebarrier();
+    atomic_work_item_fence(flags, memory_order_acquire, scope);
+}
+
diff --git a/amd-builtins/subgroup/subbcast.cl b/amd-builtins/subgroup/subbcast.cl
new file mode 100644
index 0000000..9adece6
--- /dev/null
+++ b/amd-builtins/subgroup/subbcast.cl
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier(void);
+
+__attribute__((always_inline)) static uint
+bcast32(uint a, uint lid)
+{
+    a = __hsail_activelaneshuffle_wavewidth_b32(a, lid, 0U, false);
+    __hsail_wavebarrier();
+    return a;
+}
+
+extern __attribute__((overloadable, alias("bcast32"))) uint sub_group_broadcast(uint, uint);
+extern __attribute__((overloadable, alias("bcast32"))) int sub_group_broadcast(int, uint);
+extern __attribute__((overloadable, alias("bcast32"))) float sub_group_broadcast(float, uint);
+
+
+__attribute__((always_inline)) static ulong
+bcast64(ulong a, uint lid)
+{
+    a = __hsail_activelaneshuffle_wavewidth_b64(a, lid, 0UL, false);
+    __hsail_wavebarrier();
+    return a;
+}
+
+extern __attribute__((overloadable, alias("bcast64"))) ulong sub_group_broadcast(ulong, uint);
+extern __attribute__((overloadable, alias("bcast64"))) long sub_group_broadcast(long, uint);
+extern __attribute__((overloadable, alias("bcast64"))) double sub_group_broadcast(double, uint);
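+
+// The alias trick works because broadcast only moves bit patterns: one
+// 32-bit shuffle kernel serves int, uint, and float, and one 64-bit kernel
+// serves long, ulong, and double, without extra as_type() calls at each
+// call site.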
+
diff --git a/amd-builtins/subgroup/subget.cl b/amd-builtins/subgroup/subget.cl
new file mode 100644
index 0000000..ab74690
--- /dev/null
+++ b/amd-builtins/subgroup/subget.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern __attribute__((pure)) uint __hsail_workitemid_flat(void);
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_size(void)
+{
+    uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
+    uint lid = (uint)get_local_linear_id();
+    return min(64U, wgs - (lid & ~63U));
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_max_sub_group_size(void)
+{
+    uint wgs = (uint)get_enqueued_local_size(0) * (uint)get_enqueued_local_size(1) * (uint)get_enqueued_local_size(2);
+    return min(64U, wgs);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_num_sub_groups(void)
+{
+    uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
+    return (wgs + 63U) >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_enqueued_num_sub_groups(void)
+{
+    uint wgs = (uint)get_enqueued_local_size(0) * (uint)get_enqueued_local_size(1) * (uint)get_enqueued_local_size(2);
+    return (wgs + 63U) >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_id(void)
+{
+    return __hsail_workitemid_flat() >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_local_id(void)
+{
+    return __hsail_workitemid_flat() & 0x3fU;
+}
+
diff --git a/amd-builtins/subgroup/subreduce.cl b/amd-builtins/subgroup/subreduce.cl
new file mode 100644
index 0000000..d706c3d
--- /dev/null
+++ b/amd-builtins/subgroup/subreduce.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_get_lane_id(void);
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier(void);
+
+#define GENA(TY,SZ,AO,AI,Z) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_reduce_add(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
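+
+// The XOR ladder above is a butterfly reduction: at step k every lane adds
+// the value held by the lane whose id differs in bit k, so after
+// log2(64) = 6 steps every lane holds the sum over the whole wave.
+// E.g. for lid = 5, the partners are lanes 4, 7, 1, 13, 21, 37.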
+
+GENA(int,32,as_int,as_uint,0)
+GENA(uint,32,,,0U)
+GENA(long,64,as_long,as_ulong,0L)
+GENA(ulong,64,,,0UL)
+GENA(float,32,as_float,as_uint,0.0f)
+GENA(double,64,as_double,as_ulong,0.0)
+
+#define GENO(TY,SZ,OP,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_reduce_##OP(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
+
+GENO(int,32,min,as_int,as_uint,INT_MAX)
+GENO(uint,32,min,,,UINT_MAX)
+GENO(long,64,min,as_long,as_ulong,LONG_MAX)
+GENO(ulong,64,min,,,ULONG_MAX)
+GENO(float,32,min,as_float,as_uint,INFINITY)
+GENO(double,64,min,as_double,as_ulong,(double)INFINITY)
+
+GENO(int,32,max,as_int,as_uint,INT_MIN)
+GENO(uint,32,max,,,0U)
+GENO(long,64,max,as_long,as_ulong,LONG_MIN)
+GENO(ulong,64,max,,,0UL)
+GENO(float,32,max,as_float,as_uint,-INFINITY)
+GENO(double,64,max,as_double,as_ulong,-(double)INFINITY)
+
+#endif
+
diff --git a/amd-builtins/subgroup/subscan.cl b/amd-builtins/subgroup/subscan.cl
new file mode 100644
index 0000000..f0cddb1
--- /dev/null
+++ b/amd-builtins/subgroup/subscan.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_get_lane_id(void);
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier(void);
+
+// Define exclusive in terms of inclusive
+
+#define EGEN(TY,OP,SZ,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_exclusive_##OP(TY a) \
+{ \
+    a = sub_group_scan_inclusive_##OP(a); \
+    uint lid = __hsail_get_lane_id(); \
+    a = AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \
+    return a; \
+}
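+
+// EGEN derives the exclusive scan from the inclusive one by shifting the
+// result down one lane; lane 0, which has no predecessor, takes the
+// operation's identity ID via the useival path of the shuffle.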
+
+EGEN(int,add,32,as_int,as_uint,0)
+EGEN(int,min,32,as_int,as_uint,INT_MAX)
+EGEN(int,max,32,as_int,as_uint,INT_MIN)
+
+EGEN(uint,add,32,,,0U)
+EGEN(uint,min,32,,,UINT_MAX)
+EGEN(uint,max,32,,,0U)
+
+EGEN(long,add,64,as_long,as_ulong,0L)
+EGEN(long,min,64,as_long,as_ulong,LONG_MAX)
+EGEN(long,max,64,as_long,as_ulong,LONG_MIN)
+
+EGEN(ulong,add,64,,,0UL)
+EGEN(ulong,min,64,,,ULONG_MAX)
+EGEN(ulong,max,64,,,0UL)
+
+EGEN(float,add,32,as_float,as_uint,0.0f)
+EGEN(float,min,32,as_float,as_uint,INFINITY)
+EGEN(float,max,32,as_float,as_uint,-INFINITY)
+
+EGEN(double,add,64,as_double,as_ulong,0.0)
+EGEN(double,min,64,as_double,as_ulong,(double)INFINITY)
+EGEN(double,max,64,as_double,as_ulong,-(double)INFINITY)
+
+// Now inclusive scan
+
+#define IGENA(TY,SZ,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_inclusive_add(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32)); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
+
+#define IGENO(TY,SZ,OP,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_inclusive_##OP(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32))); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
+
+IGENA(int,32,as_int,as_uint,0)
+IGENO(int,32,min,as_int,as_uint,INT_MAX)
+IGENO(int,32,max,as_int,as_uint,INT_MIN)
+
+IGENA(uint,32,,,0U)
+IGENO(uint,32,min,,,UINT_MAX)
+IGENO(uint,32,max,,,0U)
+
+IGENA(long,64,as_long,as_ulong,0L)
+IGENO(long,64,min,as_long,as_ulong,LONG_MAX)
+IGENO(long,64,max,as_long,as_ulong,LONG_MIN)
+
+IGENA(ulong,64,,,0UL)
+IGENO(ulong,64,min,,,ULONG_MAX)
+IGENO(ulong,64,max,,,0UL)
+
+IGENA(float,32,as_float,as_uint,0.0f)
+IGENO(float,32,min,as_float,as_uint,INFINITY)
+IGENO(float,32,max,as_float,as_uint,-INFINITY)
+
+IGENA(double,64,as_double,as_ulong,0.0)
+IGENO(double,64,min,as_double,as_ulong,(double)INFINITY)
+IGENO(double,64,max,as_double,as_ulong,-(double)INFINITY)
+
diff --git a/amd-builtins/vldst/f16_f32.cl b/amd-builtins/vldst/f16_f32.cl
new file mode 100644
index 0000000..d4fddd5
--- /dev/null
+++ b/amd-builtins/vldst/f16_f32.cl
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern float __amdil_half_to_float_f32(uint op1);
+
+extern float __amdil_float_to_half_f32(float op1);
+extern float __amdil_float_to_half_near_f32(float op1);
+extern float __amdil_float_to_half_neg_inf_f32(float op1);
+extern float __amdil_float_to_half_plus_inf_f32(float op1);
+
+// half -> float
+__attribute__((always_inline)) float
+__cvt_f16_to_f32(ushort a)
+{
+    return __amdil_half_to_float_f32((uint)a);
+}
+
+__attribute__((always_inline)) float2
+__cvt_2f16_to_2f32(ushort2 ush)
+{
+    float2 ret;
+    ret.s0 = __cvt_f16_to_f32(ush.s0);
+    ret.s1 = __cvt_f16_to_f32(ush.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) float3
+__cvt_3f16_to_3f32(ushort3 ush)
+{
+    float3 ret;
+    ret.lo = __cvt_2f16_to_2f32(ush.lo);
+    ret.s2 = __cvt_f16_to_f32(ush.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) float4
+__cvt_4f16_to_4f32(ushort4 ush)
+{
+    float4 ret;
+    ret.lo = __cvt_2f16_to_2f32(ush.lo);
+    ret.hi = __cvt_2f16_to_2f32(ush.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) float8
+__cvt_8f16_to_8f32(ushort8 ush)
+{
+    float8 ret;
+    ret.lo = __cvt_4f16_to_4f32(ush.lo);
+    ret.hi = __cvt_4f16_to_4f32(ush.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) float16
+__cvt_16f16_to_16f32(ushort16 ush)
+{
+    float16 ret;
+    ret.lo = __cvt_8f16_to_8f32(ush.lo);
+    ret.hi = __cvt_8f16_to_8f32(ush.hi);
+    return ret;
+}
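+
+// All of the vector converters in this file follow the same recursive
+// lo/hi pattern: an N-element conversion is two N/2-element conversions
+// (plus one scalar for the odd lane of the 3-element case), so only the
+// scalar __cvt_* routines touch the __amdil intrinsics directly.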
+
+// float -> half rte
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_rte(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_near_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rte(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rte(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rte(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rte(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rte(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rte(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rte(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rte(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rte(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rte(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rte(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rte(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rte(f.hi);
+    return ret;
+}
+
+// float -> half cur
+// XXX assumes RTE
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_cur(float f)
+{
+    return __cvt_f32_to_f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_cur(float2 f)
+{
+    return __cvt_2f32_to_2f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_cur(float3 f)
+{
+    return __cvt_3f32_to_3f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_cur(float4 f)
+{
+    return __cvt_4f32_to_4f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_cur(float8 f)
+{
+    return __cvt_8f32_to_8f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_cur(float16 f)
+{
+    return __cvt_16f32_to_16f16_rte(f);
+}
+
+// float -> half rtp
+
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_rtp(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_plus_inf_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtp(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rtp(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rtp(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtp(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtp(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rtp(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtp(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtp(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtp(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rtp(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtp(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rtp(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rtp(f.hi);
+    return ret;
+}
+
+// float -> half rtn
+
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_rtn(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_neg_inf_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtn(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rtn(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rtn(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtn(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtn(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rtn(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtn(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtn(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtn(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rtn(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtn(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rtn(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rtn(f.hi);
+    return ret;
+}
+
+// float -> half rtz
+
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_rtz(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtz(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rtz(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rtz(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtz(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtz(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rtz(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtz(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtz(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtz(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rtz(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtz(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rtz(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rtz(f.hi);
+    return ret;
+}
+
diff --git a/amd-builtins/vldst/f64_f16.cl b/amd-builtins/vldst/f64_f16.cl
new file mode 100644
index 0000000..d603d8d
--- /dev/null
+++ b/amd-builtins/vldst/f64_f16.cl
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern float __amdil_double_to_half_f64(double op1);
+extern float __amdil_double_to_half_near_f64(double op1);
+extern float __amdil_double_to_half_neg_inf_f64(double op1);
+extern float __amdil_double_to_half_plus_inf_f64(double op1);
+
+// double -> half rte
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_rte(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_near_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rte(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rte(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rte(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rte(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rte(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rte(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rte(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rte(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rte(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rte(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rte(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rte(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rte(f.hi);
+    return ret;
+}
+
+// double -> half cur
+// XXX assumes RTE
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_cur(double f)
+{
+    return __cvt_f64_to_f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_cur(double2 f)
+{
+    return __cvt_2f64_to_2f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_cur(double3 f)
+{
+    return __cvt_3f64_to_3f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_cur(double4 f)
+{
+    return __cvt_4f64_to_4f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_cur(double8 f)
+{
+    return __cvt_8f64_to_8f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_cur(double16 f)
+{
+    return __cvt_16f64_to_16f16_rte(f);
+}
+
+// double -> half rtp
+
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_rtp(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_plus_inf_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtp(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rtp(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rtp(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtp(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtp(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rtp(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtp(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtp(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtp(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rtp(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtp(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rtp(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rtp(f.hi);
+    return ret;
+}
+
+// double -> half rtn
+
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_rtn(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_neg_inf_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtn(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rtn(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rtn(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtn(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtn(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rtn(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtn(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtn(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtn(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rtn(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtn(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rtn(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rtn(f.hi);
+    return ret;
+}
+
+// double -> half rtz
+
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_rtz(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtz(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rtz(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rtz(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtz(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtz(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rtz(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtz(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtz(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtz(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rtz(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtz(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rtz(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rtz(f.hi);
+    return ret;
+}
+
diff --git a/amd-builtins/vldst/vldst_gen.cl b/amd-builtins/vldst/vldst_gen.cl
new file mode 100644
index 0000000..7d1f4ae
--- /dev/null
+++ b/amd-builtins/vldst/vldst_gen.cl
@@ -0,0 +1,3206 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
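+// Floating-point vloadN overloads are implemented by loading the same-size
+// integer vector and bit-casting the result, so only the integer helpers
+// later in this file contain actual per-element load code.
+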
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const float *p)
+{
+    return as_float2(vload2(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __constant float *p)
+{
+    return as_float2(vload2(i, (const __constant int *)p));
+}
+
+
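+// Prior to OpenCL C 2.0 there is no generic address space, so the __global
+// and __local overloads must be declared explicitly; from 2.0 on, the
+// unqualified (generic) overload above already covers them. __constant is
+// not part of the generic address space and always keeps its own overload.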
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __global float *p)
+{
+    return as_float2(vload2(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __local float *p)
+{
+    return as_float2(vload2(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const double *p)
+{
+    return as_double2(vload2(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __constant double *p)
+{
+    return as_double2(vload2(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __global double *p)
+{
+    return as_double2(vload2(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __local double *p)
+{
+    return as_double2(vload2(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const float *p)
+{
+    return as_float3(vload3(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __constant float *p)
+{
+    return as_float3(vload3(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __global float *p)
+{
+    return as_float3(vload3(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __local float *p)
+{
+    return as_float3(vload3(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const double *p)
+{
+    return as_double3(vload3(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __constant double *p)
+{
+    return as_double3(vload3(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __global double *p)
+{
+    return as_double3(vload3(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __local double *p)
+{
+    return as_double3(vload3(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const float *p)
+{
+    return as_float4(vload4(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __constant float *p)
+{
+    return as_float4(vload4(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __global float *p)
+{
+    return as_float4(vload4(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __local float *p)
+{
+    return as_float4(vload4(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const double *p)
+{
+    return as_double4(vload4(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __constant double *p)
+{
+    return as_double4(vload4(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __global double *p)
+{
+    return as_double4(vload4(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __local double *p)
+{
+    return as_double4(vload4(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const float *p)
+{
+    return as_float8(vload8(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __constant float *p)
+{
+    return as_float8(vload8(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __global float *p)
+{
+    return as_float8(vload8(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __local float *p)
+{
+    return as_float8(vload8(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const double *p)
+{
+    return as_double8(vload8(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __constant double *p)
+{
+    return as_double8(vload8(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __global double *p)
+{
+    return as_double8(vload8(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __local double *p)
+{
+    return as_double8(vload8(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const float *p)
+{
+    return as_float16(vload16(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __constant float *p)
+{
+    return as_float16(vload16(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __global float *p)
+{
+    return as_float16(vload16(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __local float *p)
+{
+    return as_float16(vload16(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const double *p)
+{
+    return as_double16(vload16(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __constant double *p)
+{
+    return as_double16(vload16(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __global double *p)
+{
+    return as_double16(vload16(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __local double *p)
+{
+    return as_double16(vload16(i, (const __local long *)p));
+}
+#endif
+
+
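+// vstoreN overloads for floating-point types mirror the loads: the value is
+// bit-cast to the same-size integer vector and stored through the integer
+// path. There are no __constant overloads because that address space is
+// read-only.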
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, float *p)
+{
+    vstore2(as_int2(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, __global float *p)
+{
+    vstore2(as_int2(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, __local float *p)
+{
+    vstore2(as_int2(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, double *p)
+{
+    vstore2(as_long2(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, __global double *p)
+{
+    vstore2(as_long2(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, __local double *p)
+{
+    vstore2(as_long2(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, float *p)
+{
+    vstore3(as_int3(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, __global float *p)
+{
+    vstore3(as_int3(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, __local float *p)
+{
+    vstore3(as_int3(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, double *p)
+{
+    vstore3(as_long3(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, __global double *p)
+{
+    vstore3(as_long3(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, __local double *p)
+{
+    vstore3(as_long3(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, float *p)
+{
+    vstore4(as_int4(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, __global float *p)
+{
+    vstore4(as_int4(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, __local float *p)
+{
+    vstore4(as_int4(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, double *p)
+{
+    vstore4(as_long4(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, __global double *p)
+{
+    vstore4(as_long4(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, __local double *p)
+{
+    vstore4(as_long4(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, float *p)
+{
+    vstore8(as_int8(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, __global float *p)
+{
+    vstore8(as_int8(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, __local float *p)
+{
+    vstore8(as_int8(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, double *p)
+{
+    vstore8(as_long8(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, __global double *p)
+{
+    vstore8(as_long8(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, __local double *p)
+{
+    vstore8(as_long8(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, float *p)
+{
+    vstore16(as_int16(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, __global float *p)
+{
+    vstore16(as_int16(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, __local float *p)
+{
+    vstore16(as_int16(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, double *p)
+{
+    vstore16(as_long16(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, __global double *p)
+{
+    vstore16(as_long16(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, __local double *p)
+{
+    vstore16(as_long16(v), i, (__local long *)p);
+}
+#endif
+
+
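+// The integer loads below do the real work. Since a vloadN pointer is only
+// guaranteed to be aligned to the element type, each component is read
+// individually. One static helper is emitted per element size / vector
+// width / address space, and the signed and unsigned vloadN overloads are
+// both bound to it with __attribute__((alias)), since the two share a bit
+// pattern. The helper names appear to encode the address space
+// (p = private/default, c = __constant, g = __global, l = __local), the
+// element size in bytes, and the vector width; e.g. vldg48 loads an int8
+// from __global memory.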
+__attribute__((always_inline)) static char2
+vldp12(size_t i, const char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp12")))  char2 vload2(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp12"))) uchar2 vload2(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char2
+vldc12(size_t i, const __constant char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc12")))  char2 vload2(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc12"))) uchar2 vload2(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char2
+vldg12(size_t i, const __global char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg12")))  char2 vload2(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg12"))) uchar2 vload2(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char2
+vldl12(size_t i, const __local char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl12")))  char2 vload2(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl12"))) uchar2 vload2(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short2
+vldp22(size_t i, const short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp22")))  short2 vload2(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp22"))) ushort2 vload2(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short2
+vldc22(size_t i, const __constant short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc22")))  short2 vload2(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc22"))) ushort2 vload2(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short2
+vldg22(size_t i, const __global short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg22")))  short2 vload2(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg22"))) ushort2 vload2(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short2
+vldl22(size_t i, const __local short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl22")))  short2 vload2(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl22"))) ushort2 vload2(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int2
+vldp42(size_t i, const int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp42")))  int2 vload2(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp42"))) uint2 vload2(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int2
+vldc42(size_t i, const __constant int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc42")))  int2 vload2(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc42"))) uint2 vload2(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int2
+vldg42(size_t i, const __global int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg42")))  int2 vload2(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg42"))) uint2 vload2(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int2
+vldl42(size_t i, const __local int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl42")))  int2 vload2(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl42"))) uint2 vload2(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long2
+vldp82(size_t i, const long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp82")))  long2 vload2(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp82"))) ulong2 vload2(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long2
+vldc82(size_t i, const __constant long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc82")))  long2 vload2(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc82"))) ulong2 vload2(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long2
+vldg82(size_t i, const __global long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg82")))  long2 vload2(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg82"))) ulong2 vload2(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long2
+vldl82(size_t i, const __local long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl82")))  long2 vload2(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl82"))) ulong2 vload2(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char3
+vldp13(size_t i, const char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp13")))  char3 vload3(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp13"))) uchar3 vload3(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char3
+vldc13(size_t i, const __constant char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc13")))  char3 vload3(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc13"))) uchar3 vload3(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char3
+vldg13(size_t i, const __global char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg13")))  char3 vload3(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg13"))) uchar3 vload3(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char3
+vldl13(size_t i, const __local char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl13")))  char3 vload3(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl13"))) uchar3 vload3(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short3
+vldp23(size_t i, const short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp23")))  short3 vload3(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp23"))) ushort3 vload3(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short3
+vldc23(size_t i, const __constant short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc23")))  short3 vload3(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc23"))) ushort3 vload3(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short3
+vldg23(size_t i, const __global short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg23")))  short3 vload3(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg23"))) ushort3 vload3(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short3
+vldl23(size_t i, const __local short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl23")))  short3 vload3(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl23"))) ushort3 vload3(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int3
+vldp43(size_t i, const int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp43")))  int3 vload3(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp43"))) uint3 vload3(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int3
+vldc43(size_t i, const __constant int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc43")))  int3 vload3(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc43"))) uint3 vload3(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int3
+vldg43(size_t i, const __global int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg43")))  int3 vload3(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg43"))) uint3 vload3(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int3
+vldl43(size_t i, const __local int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl43")))  int3 vload3(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl43"))) uint3 vload3(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long3
+vldp83(size_t i, const long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp83")))  long3 vload3(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp83"))) ulong3 vload3(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long3
+vldc83(size_t i, const __constant long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc83")))  long3 vload3(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc83"))) ulong3 vload3(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long3
+vldg83(size_t i, const __global long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg83")))  long3 vload3(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg83"))) ulong3 vload3(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long3
+vldl83(size_t i, const __local long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl83")))  long3 vload3(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl83"))) ulong3 vload3(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char4
+vldp14(size_t i, const char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp14")))  char4 vload4(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp14"))) uchar4 vload4(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char4
+vldc14(size_t i, const __constant char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc14")))  char4 vload4(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc14"))) uchar4 vload4(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char4
+vldg14(size_t i, const __global char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg14")))  char4 vload4(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg14"))) uchar4 vload4(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char4
+vldl14(size_t i, const __local char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl14")))  char4 vload4(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl14"))) uchar4 vload4(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short4
+vldp24(size_t i, const short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp24")))  short4 vload4(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp24"))) ushort4 vload4(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short4
+vldc24(size_t i, const __constant short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc24")))  short4 vload4(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc24"))) ushort4 vload4(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short4
+vldg24(size_t i, const __global short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg24")))  short4 vload4(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg24"))) ushort4 vload4(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short4
+vldl24(size_t i, const __local short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl24")))  short4 vload4(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl24"))) ushort4 vload4(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int4
+vldp44(size_t i, const int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp44")))  int4 vload4(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp44"))) uint4 vload4(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int4
+vldc44(size_t i, const __constant int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc44")))  int4 vload4(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc44"))) uint4 vload4(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int4
+vldg44(size_t i, const __global int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg44")))  int4 vload4(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg44"))) uint4 vload4(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int4
+vldl44(size_t i, const __local int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl44")))  int4 vload4(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl44"))) uint4 vload4(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long4
+vldp84(size_t i, const long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp84")))  long4 vload4(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp84"))) ulong4 vload4(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long4
+vldc84(size_t i, const __constant long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc84")))  long4 vload4(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc84"))) ulong4 vload4(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long4
+vldg84(size_t i, const __global long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg84")))  long4 vload4(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg84"))) ulong4 vload4(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long4
+vldl84(size_t i, const __local long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl84")))  long4 vload4(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl84"))) ulong4 vload4(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char8
+vldp18(size_t i, const char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp18")))  char8 vload8(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp18"))) uchar8 vload8(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char8
+vldc18(size_t i, const __constant char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc18")))  char8 vload8(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc18"))) uchar8 vload8(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char8
+vldg18(size_t i, const __global char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg18")))  char8 vload8(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg18"))) uchar8 vload8(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char8
+vldl18(size_t i, const __local char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl18")))  char8 vload8(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl18"))) uchar8 vload8(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short8
+vldp28(size_t i, const short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp28")))  short8 vload8(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp28"))) ushort8 vload8(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short8
+vldc28(size_t i, const __constant short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc28")))  short8 vload8(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc28"))) ushort8 vload8(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short8
+vldg28(size_t i, const __global short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg28")))  short8 vload8(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg28"))) ushort8 vload8(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short8
+vldl28(size_t i, const __local short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl28")))  short8 vload8(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl28"))) ushort8 vload8(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int8
+vldp48(size_t i, const int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp48")))  int8 vload8(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp48"))) uint8 vload8(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int8
+vldc48(size_t i, const __constant int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc48")))  int8 vload8(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc48"))) uint8 vload8(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int8
+vldg48(size_t i, const __global int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg48")))  int8 vload8(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg48"))) uint8 vload8(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int8
+vldl48(size_t i, const __local int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl48")))  int8 vload8(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl48"))) uint8 vload8(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long8
+vldp88(size_t i, const long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp88")))  long8 vload8(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp88"))) ulong8 vload8(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long8
+vldc88(size_t i, const __constant long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc88")))  long8 vload8(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc88"))) ulong8 vload8(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long8
+vldg88(size_t i, const __global long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg88")))  long8 vload8(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg88"))) ulong8 vload8(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long8
+vldl88(size_t i, const __local long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl88")))  long8 vload8(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl88"))) ulong8 vload8(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char16
+vldp116(size_t i, const char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp116")))  char16 vload16(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp116"))) uchar16 vload16(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char16
+vldc116(size_t i, const __constant char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc116")))  char16 vload16(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc116"))) uchar16 vload16(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char16
+vldg116(size_t i, const __global char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg116")))  char16 vload16(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg116"))) uchar16 vload16(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char16
+vldl116(size_t i, const __local char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl116")))  char16 vload16(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl116"))) uchar16 vload16(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short16
+vldp216(size_t i, const short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp216")))  short16 vload16(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp216"))) ushort16 vload16(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short16
+vldc216(size_t i, const __constant short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc216")))  short16 vload16(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc216"))) ushort16 vload16(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short16
+vldg216(size_t i, const __global short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg216")))  short16 vload16(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg216"))) ushort16 vload16(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short16
+vldl216(size_t i, const __local short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl216")))  short16 vload16(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl216"))) ushort16 vload16(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int16
+vldp416(size_t i, const int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp416")))  int16 vload16(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp416"))) uint16 vload16(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int16
+vldc416(size_t i, const __constant int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc416")))  int16 vload16(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc416"))) uint16 vload16(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int16
+vldg416(size_t i, const __global int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg416")))  int16 vload16(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg416"))) uint16 vload16(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int16
+vldl416(size_t i, const __local int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl416")))  int16 vload16(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl416"))) uint16 vload16(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long16
+vldp816(size_t i, const long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp816")))  long16 vload16(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp816"))) ulong16 vload16(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long16
+vldc816(size_t i, const __constant long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc816")))  long16 vload16(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc816"))) ulong16 vload16(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long16
+vldg816(size_t i, const __global long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg816")))  long16 vload16(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg816"))) ulong16 vload16(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long16
+vldl816(size_t i, const __local long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl816")))  long16 vload16(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl816"))) ulong16 vload16(size_t, const __local ulong *);
+#endif
+
+
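+// Integer store helpers, mirroring the load helpers above: one static
+// implementation per element size / vector width / address space, aliased
+// to both the signed and unsigned vstoreN overloads.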
+__attribute__((always_inline)) static void
+vstp12(char2 v, size_t i, char *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp12"))) void vstore2( char2, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp12"))) void vstore2(uchar2, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg12(char2 v, size_t i, __global char *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2( char2, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2(uchar2, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl12(char2 v, size_t i, __local char *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2( char2, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2(uchar2, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp22(short2 v, size_t i, short *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2( short2, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2(ushort2, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg22(short2 v, size_t i, __global short *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2( short2, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2(ushort2, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl22(short2 v, size_t i, __local short *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl22"))) void vstore2( short2, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl22"))) void vstore2(ushort2, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp42(int2 v, size_t i, int *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2( int2, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2(uint2, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg42(int2 v, size_t i, __global int *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2( int2, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2(uint2, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl42(int2 v, size_t i, __local int *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2( int2, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2(uint2, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp82(long2 v, size_t i, long *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2( long2, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2(ulong2, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg82(long2 v, size_t i, __global long *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2( long2, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2(ulong2, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl82(long2 v, size_t i, __local long *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2( long2, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2(ulong2, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp13(char3 v, size_t i, char *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3( char3, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3(uchar3, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg13(char3 v, size_t i, __global char *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3( char3, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3(uchar3, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl13(char3 v, size_t i, __local char *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3( char3, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3(uchar3, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp23(short3 v, size_t i, short *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3( short3, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3(ushort3, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg23(short3 v, size_t i, __global short *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3( short3, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3(ushort3, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl23(short3 v, size_t i, __local short *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3( short3, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3(ushort3, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp43(int3 v, size_t i, int *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3( int3, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3(uint3, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg43(int3 v, size_t i, __global int *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3( int3, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3(uint3, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl43(int3 v, size_t i, __local int *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3( int3, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3(uint3, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp83(long3 v, size_t i, long *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3( long3, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3(ulong3, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg83(long3 v, size_t i, __global long *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3( long3, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3(ulong3, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl83(long3 v, size_t i, __local long *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3( long3, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3(ulong3, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp14(char4 v, size_t i, char *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4( char4, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4(uchar4, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg14(char4 v, size_t i, __global char *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4( char4, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4(uchar4, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl14(char4 v, size_t i, __local char *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4( char4, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4(uchar4, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp24(short4 v, size_t i, short *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4( short4, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4(ushort4, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg24(short4 v, size_t i, __global short *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4( short4, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4(ushort4, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl24(short4 v, size_t i, __local short *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4( short4, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4(ushort4, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp44(int4 v, size_t i, int *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4( int4, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4(uint4, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg44(int4 v, size_t i, __global int *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4( int4, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4(uint4, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl44(int4 v, size_t i, __local int *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4( int4, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4(uint4, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp84(long4 v, size_t i, long *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4( long4, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4(ulong4, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg84(long4 v, size_t i, __global long *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4( long4, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4(ulong4, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl84(long4 v, size_t i, __local long *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4( long4, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4(ulong4, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp18(char8 v, size_t i, char *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8( char8, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8(uchar8, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg18(char8 v, size_t i, __global char *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8( char8, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8(uchar8, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl18(char8 v, size_t i, __local char *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8( char8, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8(uchar8, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp28(short8 v, size_t i, short *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8( short8, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8(ushort8, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg28(short8 v, size_t i, __global short *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8( short8, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8(ushort8, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl28(short8 v, size_t i, __local short *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8( short8, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8(ushort8, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp48(int8 v, size_t i, int *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8( int8, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8(uint8, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg48(int8 v, size_t i, __global int *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8( int8, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8(uint8, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl48(int8 v, size_t i, __local int *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl48"))) void vstore8( int8, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl48"))) void vstore8(uint8, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp88(long8 v, size_t i, long *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8( long8, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8(ulong8, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg88(long8 v, size_t i, __global long *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8( long8, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8(ulong8, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl88(long8 v, size_t i, __local long *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8( long8, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8(ulong8, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp116(char16 v, size_t i, char *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp116"))) void vstore16( char16, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp116"))) void vstore16(uchar16, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg116(char16 v, size_t i, __global char *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16( char16, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16(uchar16, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl116(char16 v, size_t i, __local char *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16( char16, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16(uchar16, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp216(short16 v, size_t i, short *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16( short16, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16(ushort16, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg216(short16 v, size_t i, __global short *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16( short16, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16(ushort16, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl216(short16 v, size_t i, __local short *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16( short16, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16(ushort16, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp416(int16 v, size_t i, int *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16( int16, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16(uint16, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg416(int16 v, size_t i, __global int *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16( int16, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16(uint16, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl416(int16 v, size_t i, __local int *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16( int16, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16(uint16, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp816(long16 v, size_t i, long *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16( long16, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16(ulong16, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg816(long16 v, size_t i, __global long *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16( long16, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16(ulong16, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl816(long16 v, size_t i, __local long *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16( long16, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16(ulong16, size_t, __local ulong *);
+#endif
+
diff --git a/amd-builtins/vldst/vldst_half.cl b/amd-builtins/vldst/vldst_half.cl
new file mode 100644
index 0000000..069cded
--- /dev/null
+++ b/amd-builtins/vldst/vldst_half.cl
@@ -0,0 +1,4237 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
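+// The vload_half*/vloada_half* builtins below are built on the
+// __cvt_*f16_to_*f32 conversion intrinsics: half data is reinterpreted
+// as ushort bits, fetched with the integer vloadN helpers, and widened
+// to float.  The store builtins later in this file do the reverse via
+// the __cvt_*f32_to_*f16_<rounding> intrinsics.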
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhp(size_t i, const half *p)
+{
+    ushort h = *(const ushort *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhp"))) float  vload_half(size_t, const half *);
+extern __attribute__((overloadable, weak, alias("vldhp"))) float vloada_half(size_t, const half *);
+
+
+
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhc(size_t i, const __constant half *p)
+{
+    ushort h = *(const __constant ushort *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhc"))) float  vload_half(size_t, const __constant half *);
+extern __attribute__((overloadable, weak, alias("vldhc"))) float vloada_half(size_t, const __constant half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhg(size_t i, const __global half *p)
+{
+    ushort h = *(const __global ushort *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhg"))) float  vload_half(size_t, const __global half *);
+extern __attribute__((overloadable, weak, alias("vldhg"))) float vloada_half(size_t, const __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhl(size_t i, const __local half *p)
+{
+    ushort h = *(const __local ushort *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhl"))) float  vload_half(size_t, const __local half *);
+extern __attribute__((overloadable, weak, alias("vldhl"))) float vloada_half(size_t, const __local half *);
+#endif
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __constant half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __global half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __local half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __constant half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __global half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __local half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __constant half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __global half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __local half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __constant half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __global half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __local half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __constant half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __global half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __local half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const __local ushort *)p));
+}
+#endif
+
+
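+// The vloada_half variants may assume the vector-size alignment the
+// spec requires of vloada, so they dereference a vector pointer
+// directly instead of going through the element-wise vloadN helpers.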
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const ushort2 *)(p + i * 2));
+
+}
+
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __constant half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const __constant ushort2 *)(p + i * 2));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __global half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const __global ushort2 *)(p + i * 2));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __local half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const __local ushort2 *)(p + i * 2));
+
+}
+#endif
+
+
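+// vloada_half3 addresses the array with a stride of four elements (the
+// 3-component form is padded to the size of a half4), so it loads a
+// full ushort4 and converts only the first three components.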
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const half *p)
+{
+
+    ushort4 h = *(const ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __constant half *p)
+{
+
+    ushort4 h = *(const __constant ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __global half *p)
+{
+
+    ushort4 h = *(const __global ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __local half *p)
+{
+
+    ushort4 h = *(const __local ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+#endif
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const ushort4 *)(p + i * 4));
+
+}
+
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __constant half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const __constant ushort4 *)(p + i * 4));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __global half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const __global ushort4 *)(p + i * 4));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __local half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const __local ushort4 *)(p + i * 4));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const ushort8 *)(p + i * 8));
+
+}
+
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __constant half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const __constant ushort8 *)(p + i * 8));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __global half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const __global ushort8 *)(p + i * 8));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __local half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const __local ushort8 *)(p + i * 8));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const ushort16 *)(p + i * 16));
+
+}
+
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __constant half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const __constant ushort16 *)(p + i * 16));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __global half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const __global ushort16 *)(p + i * 16));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __local half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const __local ushort16 *)(p + i * 16));
+
+}
+#endif
+
+
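+// vstore_half/vstorea_half come in one variant per rounding mode: the
+// "cur" intrinsic suffix presumably selects the current (default)
+// rounding mode, while _rte/_rtp/_rtn/_rtz back the explicitly-rounded
+// OpenCL builtins of the same name.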
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthpf32c(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32c")))  void vstore_half(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32c"))) void vstorea_half(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthgf32c(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32c")))  void vstore_half(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32c"))) void vstorea_half(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthlf32c(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32c")))  void vstore_half(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32c"))) void vstorea_half(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthpf32e(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32e")))  void vstore_half_rte(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32e"))) void vstorea_half_rte(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthgf32e(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32e")))  void vstore_half_rte(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32e"))) void vstorea_half_rte(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthlf32e(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32e")))  void vstore_half_rte(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32e"))) void vstorea_half_rte(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthpf32p(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32p")))  void vstore_half_rtp(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32p"))) void vstorea_half_rtp(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthgf32p(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32p")))  void vstore_half_rtp(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32p"))) void vstorea_half_rtp(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthlf32p(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32p")))  void vstore_half_rtp(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32p"))) void vstorea_half_rtp(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthpf32n(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32n")))  void vstore_half_rtn(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32n"))) void vstorea_half_rtn(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthgf32n(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32n")))  void vstore_half_rtn(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32n"))) void vstorea_half_rtn(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthlf32n(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32n")))  void vstore_half_rtn(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32n"))) void vstorea_half_rtn(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthpf32z(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32z")))  void vstore_half_rtz(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32z"))) void vstorea_half_rtz(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthgf32z(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32z")))  void vstore_half_rtz(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32z"))) void vstorea_half_rtz(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthlf32z(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32z")))  void vstore_half_rtz(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32z"))) void vstorea_half_rtz(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthpf64c(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64c")))  void vstore_half(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64c"))) void vstorea_half(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthgf64c(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64c")))  void vstore_half(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64c"))) void vstorea_half(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthlf64c(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64c")))  void vstore_half(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64c"))) void vstorea_half(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthpf64e(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64e")))  void vstore_half_rte(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64e"))) void vstorea_half_rte(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthgf64e(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64e")))  void vstore_half_rte(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64e"))) void vstorea_half_rte(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthlf64e(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64e")))  void vstore_half_rte(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64e"))) void vstorea_half_rte(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthpf64p(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64p")))  void vstore_half_rtp(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64p"))) void vstorea_half_rtp(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthgf64p(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64p")))  void vstore_half_rtp(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64p"))) void vstorea_half_rtp(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthlf64p(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64p")))  void vstore_half_rtp(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64p"))) void vstorea_half_rtp(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthpf64n(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64n")))  void vstore_half_rtn(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64n"))) void vstorea_half_rtn(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthgf64n(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64n")))  void vstore_half_rtn(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64n"))) void vstorea_half_rtn(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthlf64n(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64n")))  void vstore_half_rtn(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64n"))) void vstorea_half_rtn(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthpf64z(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64z")))  void vstore_half_rtz(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64z"))) void vstorea_half_rtz(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthgf64z(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64z")))  void vstore_half_rtz(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64z"))) void vstorea_half_rtz(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthlf64z(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64z")))  void vstore_half_rtz(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64z"))) void vstorea_half_rtz(double, size_t, __local half *);
+#endif
+
+
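+// The vector vstore_halfN forms convert the whole float vector to
+// ushort bits in a single intrinsic call and reuse the integer vstoreN
+// helpers defined earlier, so the element-wise store logic is not
+// duplicated here.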
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
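+/* vstore_half3 is packed: vstore3 writes exactly three halfs at element
+ * offset i (byte offset 3 * sizeof(half) * i), with no padding element;
+ * contrast vstorea_half3 further below, which strides by four. */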
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
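+/* The double-source vector variants mirror the float ones, converting
+ * through __cvt_<N>f64_to_<N>f16_<mode> instead. */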
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
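+/* vstorea_half overloads.  The aligned forms index the buffer with a
+ * vector-sized stride, so the converted vector can be written with one
+ * aligned vector store rather than a vstoreN call.
+ *
+ * A minimal usage sketch (hypothetical kernel, not part of this library):
+ *
+ *   __kernel void pack(__global half *out, __global const float4 *in)
+ *   {
+ *       size_t gid = get_global_id(0);
+ *       vstorea_half4_rte(in[gid], gid, out); // fills out[4*gid .. 4*gid+3]
+ *   }
+ */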
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+#endif
+
+
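+/* vstorea_half3 converts three elements but performs a full, aligned ushort4
+ * store at (p + i * 4); the fourth lane of h is never written before the
+ * store, so the padding slot receives an unspecified value. */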
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_cur(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_cur(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_cur(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rte(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rte(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rte(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtp(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtp(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtp(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtn(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtn(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtn(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtz(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtz(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtz(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, half *p)
+{
+
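+    // vstorea_half3 addresses a 4-element aligned slot; only .s012
+    // carry data, so the store writes an undefined padding lane.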
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_cur(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_cur(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_cur(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rte(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rte(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rte(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtp(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtp(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtp(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtn(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtn(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtn(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtz(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtz(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtz(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+#endif
diff --git a/amd-builtins/workgroup/wg.h b/amd-builtins/workgroup/wg.h
new file mode 100644
index 0000000..f3d969f
--- /dev/null
+++ b/amd-builtins/workgroup/wg.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// XXX The runtime computes CL_DEVICE_MAX_WORK_GROUP_SIZE as
+// XXX dev->wave_front_size * dev->max_waves_per_simd
+// XXX If max_waves_per_simd is ever raised, this code will need to be updated.
+#define MAX_WAVES_PER_SIMD  4
+
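+// One 64-bit __local slot per wavefront (sub-group), shared by the
+// work-group collective builtins (reduce/scan/broadcast/any/all) as
+// temporary cross-sub-group storage.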
+#pragma OPENCL EXTENSION cl_amd_program_scope_locals : enable
+extern __local ulong __wg_scratch[MAX_WAVES_PER_SIMD];
+
diff --git a/amd-builtins/workgroup/wganyall.cl b/amd-builtins/workgroup/wganyall.cl
new file mode 100644
index 0000000..2daa659
--- /dev/null
+++ b/amd-builtins/workgroup/wganyall.cl
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
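+// work_group_all/any run in two levels: each sub-group first combines
+// its own predicates with sub_group_all/any, lane 0 of each sub-group
+// parks that partial result in __wg_scratch, and sub-group 0 combines
+// the partials.  ID is the identity of the combine (1 for all, 0 for
+// any), used to pad scratch reads for lanes at or beyond the
+// sub-group count.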
+#define GEN_AA(SUF,ID) \
+__attribute__((overloadable, always_inline)) int \
+work_group_##SUF(int predicate) \
+{ \
+    uint n = get_num_sub_groups(); \
+    int a = sub_group_##SUF(predicate); \
+    if (n == 1) \
+	return a; \
+ \
+    __local int *p = (__local int *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == 0) \
+	p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	a = l < n ? p[l] : ID; \
+	a = sub_group_##SUF(a); \
+	if (l == 0) \
+	    p[0] = a; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = p[0]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return a; \
+}
+
+GEN_AA(all, 1U)
+GEN_AA(any, 0U)
+
+#endif
+
diff --git a/amd-builtins/workgroup/wgbarrier.cl b/amd-builtins/workgroup/wgbarrier.cl
new file mode 100644
index 0000000..a682e21
--- /dev/null
+++ b/amd-builtins/workgroup/wgbarrier.cl
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
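+// work_group_barrier is a release fence, the HSAIL execution barrier,
+// then an acquire fence: writes made before the barrier are published
+// before the rendezvous, and other work-items' writes become visible
+// after it.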
+extern void __hsail_barrier(void);
+
+__attribute__((overloadable, weak, always_inline)) void
+work_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
+{
+    atomic_work_item_fence(flags, memory_order_release, scope);
+    __hsail_barrier();
+    atomic_work_item_fence(flags, memory_order_acquire, scope);
+}
+
+__attribute__((overloadable, weak, always_inline)) void
+work_group_barrier(cl_mem_fence_flags flags)
+{
+    work_group_barrier(flags, memory_scope_work_group);
+}
+
+#endif
+
diff --git a/amd-builtins/workgroup/wgbcast.cl b/amd-builtins/workgroup/wgbcast.cl
new file mode 100644
index 0000000..f279a1f
--- /dev/null
+++ b/amd-builtins/workgroup/wgbcast.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
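+// Broadcast uses a single scratch slot: the selected work-item writes
+// its value, every work-item reads it back after a barrier, and a
+// second barrier keeps the slot from being reused before all reads
+// finish.  The 1D form can stay within one sub-group if only one exists.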
+#define GEN_BROADCAST(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x) \
+{ \
+    if (get_num_sub_groups() == 1) \
+        return sub_group_broadcast(a, local_id_x); \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    if (get_local_id(0) == local_id_x) \
+        *p = a; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = *p; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+} \
+\
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x, size_t local_id_y) \
+{ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y) \
+        *p = a; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = *p; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+} \
+\
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) \
+{ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y && get_local_id(2) == local_id_z) \
+        *p = a; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = *p; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+}
+
+GEN_BROADCAST(uint)
+GEN_BROADCAST(int)
+GEN_BROADCAST(ulong)
+GEN_BROADCAST(long)
+GEN_BROADCAST(float)
+GEN_BROADCAST(double)
+
+#endif
+
diff --git a/amd-builtins/workgroup/wgreduce.cl b/amd-builtins/workgroup/wgreduce.cl
new file mode 100644
index 0000000..6ad4dec
--- /dev/null
+++ b/amd-builtins/workgroup/wgreduce.cl
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
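+// Two-level reduction: each sub-group reduces its own values, lane 0
+// of each sub-group parks the partial result in __wg_scratch, and
+// sub-group 0 reduces the partials.  GENA generates the additive
+// reductions (identity 0); GENO takes the min/max identity explicitly,
+// used to pad scratch reads past the sub-group count.
+//
+// For illustration only (a hypothetical caller, not part of this
+// library), a kernel such as
+//   __kernel void sum(__global const float *in, __global float *out) {
+//       float s = work_group_reduce_add(in[get_global_id(0)]);
+//       if (get_local_id(0) == 0) out[get_group_id(0)] = s;
+//   }
+// yields one partial sum per work-group.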
+#define GENA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_reduce_add(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_reduce_add(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == 0) \
+        p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	a = l < n ? p[l] : (TYPE)0; \
+	a = sub_group_reduce_add(a); \
+	if (l == 0) \
+	    p[0] = a; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = p[0]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+}
+
+#define GENO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_reduce_##SUF(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_reduce_##SUF(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == 0) \
+        p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	a = l < n ? p[l] : ID; \
+	a = sub_group_reduce_##SUF(a); \
+	if (l == 0) \
+	    p[0] = a; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = p[0]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+}
+
+GENA(int)
+GENA(uint)
+GENA(long)
+GENA(ulong)
+GENA(float)
+GENA(double)
+
+GENO(int,max,INT_MIN)
+GENO(uint,max,0U)
+GENO(long,max,LONG_MIN)
+GENO(ulong,max,0UL)
+GENO(float,max,-INFINITY)
+GENO(double,max,-(double)INFINITY)
+
+GENO(int,min,INT_MAX)
+GENO(uint,min,UINT_MAX)
+GENO(long,min,LONG_MAX)
+GENO(ulong,min,ULONG_MAX)
+GENO(float,min,INFINITY)
+GENO(double,min,(double)INFINITY)
+
+#endif
+
diff --git a/amd-builtins/workgroup/wgscan.cl b/amd-builtins/workgroup/wgscan.cl
new file mode 100644
index 0000000..f3b4606
--- /dev/null
+++ b/amd-builtins/workgroup/wgscan.cl
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "wg.h"
+
+#if __OPENCL_C_VERSION__ >= 200
+
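+// Inclusive scans, two levels: each sub-group scans its own values,
+// the last lane of each sub-group parks the sub-group total in
+// __wg_scratch, sub-group 0 scans those totals in place, and every
+// work-item then folds in the scanned total of the preceding
+// sub-group (p[i-1]).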
+#define GENIA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_inclusive_add(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_scan_inclusive_add(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE t = l < n ? p[l] : (TYPE)0; \
+	t = sub_group_scan_inclusive_add(t); \
+	if (l < n) \
+	    p[l] = t; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? a : a + p[i-1]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+#define GENIO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_inclusive_##SUF(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_scan_inclusive_##SUF(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE t = l < n ? p[l] : ID; \
+	t = sub_group_scan_inclusive_##SUF(t); \
+	if (l < n) \
+	    p[l] = t; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? a : SUF(a, p[i-1]); \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+GENIA(int)
+GENIA(uint)
+GENIA(long)
+GENIA(ulong)
+GENIA(float)
+GENIA(double)
+
+GENIO(int,max,INT_MIN)
+GENIO(uint,max,0U)
+GENIO(long,max,LONG_MIN)
+GENIO(ulong,max,0UL)
+GENIO(float,max,-INFINITY)
+GENIO(double,max,-(double)INFINITY)
+
+GENIO(int,min,INT_MAX)
+GENIO(uint,min,UINT_MAX)
+GENIO(long,min,LONG_MAX)
+GENIO(ulong,min,ULONG_MAX)
+GENIO(float,min,INFINITY)
+GENIO(double,min,(double)INFINITY)
+
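+// Exclusive scans: t is the sub-group-local exclusive scan; the last
+// lane parks the sub-group's inclusive total (a combined with t) in
+// scratch so the cross-sub-group pass can reuse an inclusive scan,
+// and each work-item folds the preceding sub-group's prefix into t.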
+#define GENEA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_exclusive_add(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    TYPE t = sub_group_scan_exclusive_add(a); \
+    if (n == 1) \
+        return t; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = a + t; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE s = l < n ? p[l] : (TYPE)0; \
+	s = sub_group_scan_inclusive_add(s); \
+	if (l < n) \
+	    p[l] = s; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? t : t + p[i-1]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+#define GENEO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_exclusive_##SUF(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    TYPE t = sub_group_scan_exclusive_##SUF(a); \
+    if (n == 1) \
+        return t; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = SUF(a, t); \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE s = l < n ? p[l] : ID; \
+	s = sub_group_scan_inclusive_##SUF(s); \
+	if (l < n) \
+	    p[l] = s; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? t : SUF(t, p[i-1]); \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+GENEA(int)
+GENEA(uint)
+GENEA(long)
+GENEA(ulong)
+GENEA(float)
+GENEA(double)
+
+GENEO(int,max,INT_MIN)
+GENEO(uint,max,0U)
+GENEO(long,max,LONG_MIN)
+GENEO(ulong,max,0UL)
+GENEO(float,max,-INFINITY)
+GENEO(double,max,-(double)INFINITY)
+
+GENEO(int,min,INT_MAX)
+GENEO(uint,min,UINT_MAX)
+GENEO(long,min,LONG_MAX)
+GENEO(ulong,min,ULONG_MAX)
+GENEO(float,min,INFINITY)
+GENEO(double,min,(double)INFINITY)
+
+#endif
+
diff --git a/amd-builtins/workgroup/wgscratch.cl b/amd-builtins/workgroup/wgscratch.cl
new file mode 100644
index 0000000..e3db83e
--- /dev/null
+++ b/amd-builtins/workgroup/wgscratch.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+// Temporary data for work group functions
+__local ulong __wg_scratch[MAX_WAVES_PER_SIMD];
+
+#endif