// generic/lib/integer/mul_hi.cl - llvm-project/libclc - Git at Google

 #include <clc/clc.h>

 //Generic scalar mul_hi, used for every type EXCEPT long/ulong (those are
 //written out by hand). The operands are widened to BGENTYPE and multiplied
 //there; shifting the wide product right by GENSIZE bits leaves the upper
 //half, which is then narrowed back to GENTYPE.
 #define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
     _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \
         BGENTYPE wide_product = (BGENTYPE)x * (BGENTYPE)y; \
         return (GENTYPE)(wide_product >> GENSIZE); \
     }

 //FOIL-based long mul_hi
 //
 // Summary: write x = a*2^32 + b and y = c*2^32 + d, where a and c are the
 // (signed) high-order halves of x and y and b and d are the (unsigned)
 // low-order halves. Expanding the product with FOIL gives
 //   x*y = F*2^64 + (O + I)*2^32 + L
 // with F = a*c (First), O = a*d (Outer), I = b*c (Inner), L = b*d (Last).
 // The upper 64 bits of the product are therefore
 //   F + floor((O + I + (L >> 32)) / 2^32).

 _CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){
     long f, o, i;
     ulong l;

     //Move the high/low halves of x/y into the lower 32-bits of variables so
     //that we can multiply them without worrying about overflow.
     //The high halves keep the sign of x/y (signed right shift); the low
     //halves are masked and therefore always in [0, 2^32-1].
     long x_hi = x >> 32;
     long x_lo = x & UINT_MAX;
     long y_hi = y >> 32;
     long y_lo = y & UINT_MAX;

     //Multiply all of the components according to FOIL method
     f = x_hi * y_hi;
     o = x_hi * y_lo;
     i = x_lo * y_hi;
     //Both low halves can be as large as 2^32-1, so their product can exceed
     //LONG_MAX. Do this one multiplication in ulong, where the full product
     //(< 2^64) is representable, instead of overflowing a signed long.
     l = (ulong)x_lo * (ulong)y_lo;

     //Now add the components back together in the following steps:
     //F: doesn't need to be modified
     //O/I: Need to be added together.
     //L: Shift right by 32-bits, then add into the sum of O and I
     //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
     //
     //We use hadd to give us a bit of extra precision for the intermediate sums
     //but as a result, we shift by 31 bits instead of 32
     return f + (hadd(o, i + (long)(l >> 32)) >> 31);
 }

 //Unsigned 64-bit mul_hi: returns the upper 64 bits of the 128-bit product
 //x*y, built from four 32x32-bit partial products (First/Outer/Inner/Last).
 _CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y){
     //Each operand is split into two 32-bit halves, each held in a 64-bit
     //variable so the partial products below cannot wrap.
     const ulong x_hi = x >> 32;
     const ulong y_hi = y >> 32;
     const ulong x_lo = x & UINT_MAX;
     const ulong y_lo = y & UINT_MAX;

     //x*y = first*2^64 + (outer + inner)*2^32 + last
     const ulong first = x_hi * y_hi;
     const ulong outer = x_hi * y_lo;
     const ulong inner = x_lo * y_hi;
     const ulong last = x_lo * y_lo;

     //Only the upper 32 bits of 'last' can reach the high word; fold them
     //into the inner term.
     const ulong inner_with_carry = inner + (last >> 32);

     //outer + inner_with_carry may need more than 64 bits, so take the halved
     //sum via hadd; having already divided by two, shift by 31 rather than 32.
     const ulong halved_middle = hadd(outer, inner_with_carry);

     return first + (halved_middle >> 31);
 }

 //Vector overloads of mul_hi for GENTYPE. The 2- and 3-wide versions apply
 //the scalar mul_hi to each component; the 4-, 8- and 16-wide versions apply
 //the next-narrower vector overload to the .lo and .hi halves.
 #define __CLC_MUL_HI_VEC(GENTYPE) \
     _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \
         return (GENTYPE##2)(mul_hi(x.s0, y.s0), \
                             mul_hi(x.s1, y.s1)); \
     } \
     _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \
         return (GENTYPE##3)(mul_hi(x.s0, y.s0), \
                             mul_hi(x.s1, y.s1), \
                             mul_hi(x.s2, y.s2)); \
     } \
     _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \
         return (GENTYPE##4)(mul_hi(x.lo, y.lo), \
                             mul_hi(x.hi, y.hi)); \
     } \
     _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \
         return (GENTYPE##8)(mul_hi(x.lo, y.lo), \
                             mul_hi(x.hi, y.hi)); \
     } \
     _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \
         return (GENTYPE##16)(mul_hi(x.lo, y.lo), \
                              mul_hi(x.hi, y.hi)); \
     }

 //Emits the scalar mul_hi for TYPE (computed through the wider BTYPE and
 //shifted right by BITS) followed by the vector overloads for TYPE.
 #define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
     __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
     __CLC_MUL_HI_VEC(TYPE)

 //The full list of mul_hi instantiations. The 8-, 16- and 32-bit types get
 //both the macro-generated scalar version and the vector overloads; long and
 //ulong only get the vector overloads here because their scalar versions are
 //written out by hand.
 #define __CLC_MUL_HI_TYPES() \
     __CLC_MUL_HI_DEC_IMPL(short, char, 8) \
     __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \
     __CLC_MUL_HI_DEC_IMPL(int, short, 16) \
     __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
     __CLC_MUL_HI_DEC_IMPL(long, int, 32) \
     __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
     __CLC_MUL_HI_VEC(long) \
     __CLC_MUL_HI_VEC(ulong)

 //Expand all of the definitions listed above.
 __CLC_MUL_HI_TYPES()

 //Remove the helper macros so they do not leak past this point.
 #undef __CLC_MUL_HI_TYPES
 #undef __CLC_MUL_HI_DEC_IMPL
 #undef __CLC_MUL_HI_IMPL
 #undef __CLC_MUL_HI_VEC
 //NOTE(review): no #define of __CLC_B32 is visible in this file; presumably a
 //leftover or a macro from an included header - confirm before removing.
 #undef __CLC_B32
	#include <clc/clc.h>

	//Generic scalar mul_hi, used for every type EXCEPT long/ulong (those are
	//written out by hand). The operands are widened to BGENTYPE and multiplied
	//there; shifting the wide product right by GENSIZE bits leaves the upper
	//half, which is then narrowed back to GENTYPE.
	#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
	    _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \
	        BGENTYPE wide_product = (BGENTYPE)x * (BGENTYPE)y; \
	        return (GENTYPE)(wide_product >> GENSIZE); \
	    }

	//FOIL-based long mul_hi
	//
	// Summary: write x = a*2^32 + b and y = c*2^32 + d, where a and c are the
	// (signed) high-order halves of x and y and b and d are the (unsigned)
	// low-order halves. Expanding the product with FOIL gives
	//   x*y = F*2^64 + (O + I)*2^32 + L
	// with F = a*c (First), O = a*d (Outer), I = b*c (Inner), L = b*d (Last).
	// The upper 64 bits of the product are therefore
	//   F + floor((O + I + (L >> 32)) / 2^32).

	_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){
	    long f, o, i;
	    ulong l;

	    //Move the high/low halves of x/y into the lower 32-bits of variables so
	    //that we can multiply them without worrying about overflow.
	    //The high halves keep the sign of x/y (signed right shift); the low
	    //halves are masked and therefore always in [0, 2^32-1].
	    long x_hi = x >> 32;
	    long x_lo = x & UINT_MAX;
	    long y_hi = y >> 32;
	    long y_lo = y & UINT_MAX;

	    //Multiply all of the components according to FOIL method
	    f = x_hi * y_hi;
	    o = x_hi * y_lo;
	    i = x_lo * y_hi;
	    //Both low halves can be as large as 2^32-1, so their product can exceed
	    //LONG_MAX. Do this one multiplication in ulong, where the full product
	    //(< 2^64) is representable, instead of overflowing a signed long.
	    l = (ulong)x_lo * (ulong)y_lo;

	    //Now add the components back together in the following steps:
	    //F: doesn't need to be modified
	    //O/I: Need to be added together.
	    //L: Shift right by 32-bits, then add into the sum of O and I
	    //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
	    //
	    //We use hadd to give us a bit of extra precision for the intermediate sums
	    //but as a result, we shift by 31 bits instead of 32
	    return f + (hadd(o, i + (long)(l >> 32)) >> 31);
	}

	//Unsigned 64-bit mul_hi: returns the upper 64 bits of the 128-bit product
	//x*y, built from four 32x32-bit partial products (First/Outer/Inner/Last).
	_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y){
	    //Each operand is split into two 32-bit halves, each held in a 64-bit
	    //variable so the partial products below cannot wrap.
	    const ulong x_hi = x >> 32;
	    const ulong y_hi = y >> 32;
	    const ulong x_lo = x & UINT_MAX;
	    const ulong y_lo = y & UINT_MAX;

	    //x*y = first*2^64 + (outer + inner)*2^32 + last
	    const ulong first = x_hi * y_hi;
	    const ulong outer = x_hi * y_lo;
	    const ulong inner = x_lo * y_hi;
	    const ulong last = x_lo * y_lo;

	    //Only the upper 32 bits of 'last' can reach the high word; fold them
	    //into the inner term.
	    const ulong inner_with_carry = inner + (last >> 32);

	    //outer + inner_with_carry may need more than 64 bits, so take the halved
	    //sum via hadd; having already divided by two, shift by 31 rather than 32.
	    const ulong halved_middle = hadd(outer, inner_with_carry);

	    return first + (halved_middle >> 31);
	}

	//Vector overloads of mul_hi for GENTYPE. The 2- and 3-wide versions apply
	//the scalar mul_hi to each component; the 4-, 8- and 16-wide versions apply
	//the next-narrower vector overload to the .lo and .hi halves.
	#define __CLC_MUL_HI_VEC(GENTYPE) \
	    _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \
	        return (GENTYPE##2)(mul_hi(x.s0, y.s0), \
	                            mul_hi(x.s1, y.s1)); \
	    } \
	    _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \
	        return (GENTYPE##3)(mul_hi(x.s0, y.s0), \
	                            mul_hi(x.s1, y.s1), \
	                            mul_hi(x.s2, y.s2)); \
	    } \
	    _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \
	        return (GENTYPE##4)(mul_hi(x.lo, y.lo), \
	                            mul_hi(x.hi, y.hi)); \
	    } \
	    _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \
	        return (GENTYPE##8)(mul_hi(x.lo, y.lo), \
	                            mul_hi(x.hi, y.hi)); \
	    } \
	    _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \
	        return (GENTYPE##16)(mul_hi(x.lo, y.lo), \
	                             mul_hi(x.hi, y.hi)); \
	    }

	//NOTE(review): the definitions in this part of the file repeat ones that
	//already appear earlier in the file - confirm whether the duplicate is
	//intentional.
	//
	//Emits the scalar mul_hi for TYPE (computed through the wider BTYPE and
	//shifted right by BITS) followed by the vector overloads for TYPE.
	#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
	__CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
	__CLC_MUL_HI_VEC(TYPE)

	//The full list of mul_hi instantiations. The 8-, 16- and 32-bit types get
	//both the macro-generated scalar version and the vector overloads; long and
	//ulong only get the vector overloads here because their scalar versions are
	//written out by hand.
	#define __CLC_MUL_HI_TYPES() \
	__CLC_MUL_HI_DEC_IMPL(short, char, 8) \
	__CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \
	__CLC_MUL_HI_DEC_IMPL(int, short, 16) \
	__CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
	__CLC_MUL_HI_DEC_IMPL(long, int, 32) \
	__CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
	__CLC_MUL_HI_VEC(long) \
	__CLC_MUL_HI_VEC(ulong)

	//Expand all of the definitions listed above.
	__CLC_MUL_HI_TYPES()

	//Remove the helper macros so they do not leak past this point.
	#undef __CLC_MUL_HI_TYPES
	#undef __CLC_MUL_HI_DEC_IMPL
	#undef __CLC_MUL_HI_IMPL
	#undef __CLC_MUL_HI_VEC
	//NOTE(review): no #define of __CLC_B32 is visible in this file; presumably a
	//leftover or a macro from an included header - confirm before removing.
	#undef __CLC_B32