From 38df9973505161fc9af1ac69ad04b83fd4ed7fef Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 16 Jan 2017 18:43:36 -0800 Subject: [PATCH] Squashed commit of the following: commit ec4653659f0a0301da5c8aa533d7592ffdb79329 Author: Boris Fomitchev Date: Mon Jan 16 18:35:45 2017 -0800 cleanup commit b6771be0b8f72f1ff4fba98dd9d88700cb0eeb0d Author: Boris Fomitchev Date: Sun Jan 15 16:03:40 2017 -0800 Enabling half instructions for selected Pascal archs commit 18f5ce19c42a7981782a4c6168d4e435ffa9b20e Author: Boris Fomitchev Date: Sun Jan 15 15:31:38 2017 -0800 cleanup commit 0114d65f649e0eb528bf45691ddd4476c4a4db8b Author: Boris Fomitchev Date: Sun Jan 15 02:16:14 2017 -0800 Build working commit 5b7ed0477dac3f6bf08f0c11487fc95ea36f41f6 Merge: 03a77af 16fb59f Author: Boris Fomitchev Date: Fri Jan 13 18:44:52 2017 -0800 Merge remote-tracking branch 'upstream/master' into spec_refactor Conflicts: CMakeLists.txt Tensor.lua generic/CStorage.c generic/CTensor.c init.c lib/THC/THCCachingAllocator.cpp lib/THC/THCGeneral.c lib/THC/THCGenerateHalfType.h lib/THC/THCHalf.cu lib/THC/THCStorageCopy.c lib/THC/THCStorageCopy.cu lib/THC/THCTensorCopy.c lib/THC/generic/THCStorageCopy.c lib/THC/generic/THCStorageCopy.cu lib/THC/generic/THCStorageCopy.h lib/THC/generic/THCTensorCopy.h test/test_shutdown.lua commit 03a77af912a2b5b8d9478a1237a9d8eaaaa8ec3a Merge: ed0f6b4 2876894 Author: Boris Fomitchev Date: Wed Nov 23 00:59:51 2016 -0800 Merge branch 'android' into spec_refactor commit ed0f6b47bfd693bde55d5d1897de46c1076fcb92 Merge: b572cf9 2d75d41 Author: Boris Fomitchev Date: Mon Nov 21 17:28:04 2016 -0800 Merge branch 'upstream' into spec_refactor commit 287689458df744e707b321bb72d6ff2635948966 Author: Boris Fomitchev Date: Thu Nov 17 03:08:21 2016 -0800 Added C++ flags commit 9328d7dd5ff31b180250a7993ad55e4a0f8e2a51 Merge: 80df9d9 10ef056 Author: Boris Fomitchev Date: Wed Nov 16 21:49:32 2016 -0800 Merge branch 'cpu_half' into android commit 80df9d995043b12d34a9f1864884a229733efc26 Merge: 0afffe1 9a6ba41 Author: Boris Fomitchev Date: Wed Nov 16 21:25:06 2016 -0800 Merge branch 'getmeminfo-fix' commit 9a6ba41117663fcd670a2f2b412383dc5d0aa291 Author: Boris Fomitchev Date: Tue Nov 15 14:48:52 2016 -0800 Bugfix, test extended commit 9aef731946f687e08c2ba4438ab985c433e3044f Author: Boris Fomitchev Date: Tue Nov 15 02:46:49 2016 -0800 Added some memory allocations to test_shutdown to test acching allocator commit 838ec700c17cb9f2b942c8110184fa1eef4c1301 Author: Boris Fomitchev Date: Tue Nov 15 02:29:53 2016 -0800 Implemented cudaMemGetInfo for caching allocator commit b572cf9d86a6442f3b81a8c102e4a8c670611230 Merge: 38882e6 10ef056 Author: Boris Fomitchev Date: Tue Nov 1 13:02:47 2016 -0700 Merge branch 'cpu_half' into spec_refactor commit 10ef05657e6f800127797e620c1ad2ced596d33e Author: Boris Fomitchev Date: Sat Oct 29 15:54:48 2016 -0700 Using half from TH commit 18383174c82fbfd08a9bd8b84639aeec2e4ee22a Author: Boris Fomitchev Date: Fri Oct 28 17:13:04 2016 -0700 checkpoint commit 38882e699756be92d3c241cdf3533ec460e72e54 Merge: acaf1a6 21ad069 Author: Boris Fomitchev Date: Fri Oct 21 00:42:28 2016 -0700 cutorch refactoring commit acaf1a64d4fed30de3fa322926cd1babcfa1cd3b Author: Boris Fomitchev Date: Sun Oct 16 22:27:04 2016 -0700 Checkpoint commit ac964ee15f1cf434bdcb674958abaecc5018e7ba Author: Boris Fomitchev Date: Fri Oct 14 01:03:47 2016 -0700 checkpoint commit 8a75e344ed2a8056317047b5451d5c3451f6469b Merge: a80d28b ada3cbd Author: Boris Fomitchev Date: Wed Oct 12 17:43:51 2016 -0700 Merge remote-tracking branch 'upstream/master' into spec_refactor commit a80d28bbc90b0aae3bd8335d3dae2a8956dcecb6 Author: Boris Fomitchev Date: Wed Oct 12 17:41:43 2016 -0700 checkpoint commit 79a900660fdff195fa9c1d88aa0865aec95ba2b2 Merge: 6933617 9efd392 Author: Boris Fomitchev Date: Mon Oct 10 12:42:21 2016 -0700 Merge remote-tracking branch 'upstream/master' commit 69336174ee3df9d680c8c80a8e30f30bcdcf2025 Merge: b928ca0 afd74e7 Author: Boris Fomitchev Date: Sun Oct 2 21:35:17 2016 -0700 Merge remote-tracking branch 'upstream/master' commit b928ca019ea061cb9ebaca2dd81450aa24f25268 Author: Boris Fomitchev Date: Fri Sep 30 02:03:56 2016 -0700 temporarily force pseudo-fp16 mode (HAS_HALF_INSTRUCTIONS=FALSE) for Pascal: https://github.com/torch/cutorch/issues/520 --- lib/THC/THCHalf.h | 63 +- lib/THC/THCNumerics.cuh | 718 +++++------------------ lib/THC/THCTensorMathPairwise.cu | 222 +------ lib/THC/THCTensorMathPointwise.cuh | 508 +++------------- lib/THC/generic/THCTensorMathPairwise.cu | 2 +- 5 files changed, 331 insertions(+), 1182 deletions(-) diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 7c055e7a..dbeb4311 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -1,16 +1,21 @@ #ifndef THC_HALF_CONVERSION_INC -#define THC_HALF_CONVERSION_INC +# define THC_HALF_CONVERSION_INC -#include "THCGeneral.h" +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cuda_fp16.h" /* We compile with CudaHalfTensor support if we have this: */ #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 -#define CUDA_HALF_TENSOR 1 +# define CUDA_HALF_TENSOR 1 #endif #ifdef CUDA_HALF_TENSOR -#include +#include "THCGeneral.h" +#include "THHalf.h" + #include THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); @@ -24,6 +29,54 @@ THC_API int THC_nativeHalfInstructions(THCState *state); /* Check for performant native fp16 support on the current device */ THC_API int THC_fastHalfInstructions(THCState *state); -#endif /* CUDA_HALF_TENSOR */ + # if defined (__CUDA_ARCH__) +/* use instrintic functons defined for device only in cuda_fp16.h */ +# define THC_FLOAT_TO_HALF(x) __float2half((float)x) +# define THC_HALF_TO_FLOAT(x) __half2float(x) +# define THC_DECL __host__ __device__ __forceinline__ +# else +/* use host conversion functions */ +# define THC_FLOAT_TO_HALF(x) THC_float2half((float)x) +# define THC_HALF_TO_FLOAT(x) THC_half2float(x) +# define THC_DECL inline +# endif +#if __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620 +# define CUDA_HALF_INSTRUCTIONS 1 #endif + +#if defined (__cplusplus__) || defined (__CUDACC__) + +/// `half` has some type conversion issues associated with it, since it +/// is a struct without a constructor/implicit conversion constructor. +/// We use this to convert scalar values to the given type that the +/// tensor expects. + +template +struct ScalarConvert { + static THC_DECL Out to(const In& v) { return Out(v); } +}; + +template +struct ScalarConvert { + static THC_DECL Out to(const half& v) { + return (Out) THC_HALF_TO_FLOAT(v); + } +}; + +template +struct ScalarConvert { + static THC_DECL half to(const In& v) { + return THC_FLOAT_TO_HALF(v); + } +}; + +template <> +struct ScalarConvert { + static THC_DECL const half& to(const half& v) { + return v; + } +}; +# endif /* __cplusplus__ */ +# endif /* CUDA_HALF_TENSOR */ +#endif /* THC_HALF_CONVERSION_INC */ diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index 09443605..6b504e77 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -1,631 +1,235 @@ #ifndef THC_NUMERICS_INC #define THC_NUMERICS_INC -#include -#include -#include "THCHalf.h" - -/// Class for numeric limits of the particular data type, which -/// includes support for `half`. -/// Unfortunately since `half` does not have a constructor, these have -/// to be expressed as functions (either that or non-const statics). + +#include +#include "THCGeneral.h" + template -struct THCNumerics { +struct THCNumConstants +{ + static THC_DECL const T one() { return T(1); } + static THC_DECL const T zero() { return T(0); } + }; template <> -struct THCNumerics { - static inline __host__ __device__ unsigned char min() { return 0; } - static inline __host__ __device__ unsigned char max() { return UCHAR_MAX; } - - static inline __host__ __device__ bool lt(unsigned char a, unsigned char b) { return a < b; } - static inline __host__ __device__ bool le(unsigned char a, unsigned char b) { return a <= b; } - static inline __host__ __device__ bool gt(unsigned char a, unsigned char b) { return a > b; } - static inline __host__ __device__ bool ge(unsigned char a, unsigned char b) { return a >= b; } - static inline __host__ __device__ bool eq(unsigned char a, unsigned char b) { return a == b; } - static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; } - - static inline __host__ __device__ unsigned char add(unsigned char a, unsigned char b) { return a + b; } - static inline __host__ __device__ unsigned char mul(unsigned char a, unsigned char b) { return a * b; } - static inline __host__ __device__ unsigned char sub(unsigned char a, unsigned char b) { return a - b; } - static inline __host__ __device__ unsigned char div(unsigned char a, unsigned char b) { return a / b; } - static inline __host__ __device__ unsigned char abs(unsigned char a) { return abs(a); } +struct THCNumConstants +{ + static THC_DECL const half one() { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */ + static THC_DECL const half zero() { half ret; ret.x = 0; return ret;} }; -template <> -struct THCNumerics { - static inline __host__ __device__ char min() { return CHAR_MIN; } - static inline __host__ __device__ char max() { return CHAR_MAX; } - - static inline __host__ __device__ bool lt(char a, char b) { return a < b; } - static inline __host__ __device__ bool le(char a, char b) { return a <= b; } - static inline __host__ __device__ bool gt(char a, char b) { return a > b; } - static inline __host__ __device__ bool ge(char a, char b) { return a >= b; } - static inline __host__ __device__ bool eq(char a, char b) { return a == b; } - static inline __host__ __device__ bool ne(char a, char b) { return a != b; } - - static inline __host__ __device__ char add(char a, char b) { return a + b; } - static inline __host__ __device__ char mul(char a, char b) { return a * b; } - static inline __host__ __device__ char sub(char a, char b) { return a - b; } - static inline __host__ __device__ char div(char a, char b) { return a / b; } - static inline __host__ __device__ char abs(char a) { return abs(a); } +template +struct THCNumCommonBase { + typedef T storage_type; + /* type value should be converted to before doing math on it. + For most types except 16-bit floats, MathType==StorageType. + */ + typedef M math_type; + + /* type of math operation result , like (a*b). Usually == StorageType */ + typedef T expr_type; + + static THC_DECL math_type m_(const storage_type& a) { + return ScalarConvert::to(a); + } + static THC_DECL expr_type e_(const math_type& a) { + return ScalarConvert::to(a); + } + static THC_DECL storage_type s_(const expr_type& a) { + return ScalarConvert::to(a); + } + + static THC_DECL bool lt(const storage_type& a, const storage_type& b) { return m_(a) < m_(b); } + static THC_DECL bool le(const storage_type& a, const storage_type& b) { return m_(a) <= m_(b); } + static THC_DECL bool gt(const storage_type& a, const storage_type& b) { return m_(a) > m_(b); } + static THC_DECL bool ge(const storage_type& a, const storage_type& b) { return m_(a) >= m_(b); } + static THC_DECL bool eq(const storage_type& a, const storage_type& b) { return m_(a) == m_(b); } + static THC_DECL bool ne(const storage_type& a, const storage_type& b) { return m_(a) != m_(b); } + + static THC_DECL expr_type add(const storage_type& a, const storage_type& b) { return e_(m_(a) + m_(b)); } + static THC_DECL expr_type mul(const storage_type& a, const storage_type& b) { return e_(m_(a) * m_(b)); } + static THC_DECL expr_type sub(const storage_type& a, const storage_type& b) { return e_(m_(a) - m_(b)); } + static THC_DECL expr_type div(const storage_type& a, const storage_type& b) { return e_(m_(a) / m_(b)); } + static THC_DECL expr_type abs(const storage_type& a) { bool isneg = (a<0); return e_(isneg ? -a : a); } + static THC_DECL expr_type neg(const storage_type& a) { return e_(-m_(a)); } + static THC_DECL expr_type pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); } + static THC_DECL expr_type mod(const storage_type& a, const storage_type& b) { return e_(m_(a) % m_(b)); } + static THC_DECL const T min() { return std::numeric_limits::min(); } + static THC_DECL const T max() { return std::numeric_limits::max(); } }; -template <> -struct THCNumerics { - static inline __host__ __device__ short min() { return SHRT_MIN; } - static inline __host__ __device__ short max() { return SHRT_MAX; } - - static inline __host__ __device__ bool lt(short a, short b) { return a < b; } - static inline __host__ __device__ bool le(short a, short b) { return a <= b; } - static inline __host__ __device__ bool gt(short a, short b) { return a > b; } - static inline __host__ __device__ bool ge(short a, short b) { return a >= b; } - static inline __host__ __device__ bool eq(short a, short b) { return a == b; } - static inline __host__ __device__ bool ne(short a, short b) { return a != b; } - - static inline __host__ __device__ short add(short a, short b) { return a + b; } - static inline __host__ __device__ short mul(short a, short b) { return a * b; } - static inline __host__ __device__ short sub(short a, short b) { return a - b; } - static inline __host__ __device__ short div(short a, short b) { return a / b; } - static inline __host__ __device__ short abs(short a) { return abs(a); } +template +struct THCNumBase {}; + +template +struct THCNumBase : public THCNumCommonBase { }; template <> -struct THCNumerics { - static inline __host__ __device__ int min() { return INT_MIN; } - static inline __host__ __device__ int max() { return INT_MAX; } - - static inline __host__ __device__ bool lt(int a, int b) { return a < b; } - static inline __host__ __device__ bool le(int a, int b) { return a <= b; } - static inline __host__ __device__ bool gt(int a, int b) { return a > b; } - static inline __host__ __device__ bool ge(int a, int b) { return a >= b; } - static inline __host__ __device__ bool eq(int a, int b) { return a == b; } - static inline __host__ __device__ bool ne(int a, int b) { return a != b; } - - static inline __host__ __device__ int add(int a, int b) { return a + b; } - static inline __host__ __device__ int mul(int a, int b) { return a * b; } - static inline __host__ __device__ int sub(int a, int b) { return a - b; } - static inline __host__ __device__ int div(int a, int b) { return a / b; } - static inline __host__ __device__ int abs(int a) { return ::abs(a); } +struct THCNumBase : public THCNumCommonBase { + static THC_DECL expr_type abs(const storage_type& a) { return labs(a); } }; -template <> -struct THCNumerics { - static inline __host__ __device__ long min() { return LONG_MIN; } - static inline __host__ __device__ long max() { return LONG_MAX; } - - static inline __host__ __device__ bool lt(long a, long b) { return a < b; } - static inline __host__ __device__ bool le(long a, long b) { return a <= b; } - static inline __host__ __device__ bool gt(long a, long b) { return a > b; } - static inline __host__ __device__ bool ge(long a, long b) { return a >= b; } - static inline __host__ __device__ bool eq(long a, long b) { return a == b; } - static inline __host__ __device__ bool ne(long a, long b) { return a != b; } - - static inline __host__ __device__ long add(long a, long b) { return a + b; } - static inline __host__ __device__ long mul(long a, long b) { return a * b; } - static inline __host__ __device__ long sub(long a, long b) { return a - b; } - static inline __host__ __device__ long div(long a, long b) { return a / b; }; - static inline __host__ __device__ long abs(long a) { return labs(a); } +template +struct THCNumBase : public THCNumCommonBase { + typedef THCNumCommonBase Base; + using Base::e_; + using Base::m_; + using Base::s_; + using typename Base::math_type; + using typename Base::expr_type; + using typename Base::storage_type; + + static THC_DECL const T min() { return -std::numeric_limits::max(); } + static THC_DECL const T max() { return std::numeric_limits::max(); } + + + static THC_DECL expr_type exp (const storage_type& a) { return e_(::exp(m_(a))); } + static THC_DECL expr_type log (const storage_type& a) { return e_(::log(m_(a))); } + static THC_DECL expr_type log1p(const storage_type& a) { return e_(::log1p(m_(a))); } + static THC_DECL expr_type cos (const storage_type& a) { return e_(::cos(m_(a))); } + static THC_DECL expr_type sin (const storage_type& a) { return e_(::sin(m_(a))); } + static THC_DECL expr_type sqrt (const storage_type& a) { return e_(::sqrt(m_(a))); } + static THC_DECL expr_type rsqrt(const storage_type& a) { return e_(::rsqrt(m_(a))); } + static THC_DECL expr_type ceil (const storage_type& a) { return e_(::ceil(m_(a))); } + static THC_DECL expr_type floor(const storage_type& a) { return e_(::floor(m_(a))); } + static THC_DECL expr_type trunc(const storage_type& a) { return e_(::trunc(m_(a))); } + static THC_DECL expr_type acos (const storage_type& a) { return e_(::acos(m_(a))); } + static THC_DECL expr_type cosh (const storage_type& a) { return e_(::cosh(m_(a))); } + static THC_DECL expr_type acosh(const storage_type& a) { return e_(::acosh(m_(a))); } + static THC_DECL expr_type asin (const storage_type& a) { return e_(::asin(m_(a))); } + static THC_DECL expr_type sinh (const storage_type& a) { return e_(::sinh(m_(a))); } + static THC_DECL expr_type asinh(const storage_type& a) { return e_(::asinh(m_(a))); } + static THC_DECL expr_type tan (const storage_type& a) { return e_(::tan(m_(a))); } + static THC_DECL expr_type atan (const storage_type& a) { return e_(::atan(m_(a))); } + static THC_DECL expr_type tanh (const storage_type& a) { return e_(::tanh(m_(a))); } + static THC_DECL expr_type abs (const storage_type& a) { return e_(::abs(m_(a))); } + static THC_DECL expr_type round(const storage_type& a) { return e_(::round(m_(a))); } + static THC_DECL expr_type frac (const storage_type& a) { return e_(m_(a) - ::trunc(m_(a))); } + static THC_DECL expr_type cinv (const storage_type& a) { return Base::div(THCNumConstants::one(), a); } + static THC_DECL expr_type pow (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); } + static THC_DECL expr_type mod (const storage_type& a, const storage_type& b) { return e_(::fmod(m_(a), m_(b))); } + +}; + +template +struct THCNumerics: public THCNumBase::is_integer> { + typedef THCNumCommonBase Base; + using typename Base::math_type; + using typename Base::expr_type; + using typename Base::storage_type; + typedef THCNumConstants Constants; }; #ifdef CUDA_HALF_TENSOR -template <> -struct THCNumerics { - static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; } - static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; } - static inline __host__ __device__ bool lt(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS +template <> +struct THCNumerics: public THCNumBase { + static THC_DECL const half min() { half ret; ret.x = 0xFBFF; return ret; } + static THC_DECL const half max() { half ret; ret.x = 0x7BFF; return ret; } + typedef THCNumConstants Constants; + +# ifndef CUDA_HALF_INSTRUCTIONS + typedef THCNumCommonBase Base; + using typename Base::math_type; + using typename Base::expr_type; + using typename Base::storage_type; + using Base::e_; + using Base::m_; + using Base::s_; +# else + typedef THCNumCommonBase Base; + typedef typename Base::storage_type storage_type; + typedef typename Base::math_type math_type; + typedef typename Base::expr_type expr_type; + static THC_DECL math_type m_(const storage_type& a) { + return ScalarConvert::to(a); + } + static THC_DECL expr_type e_(const math_type& a) { + return ScalarConvert::to(a); + } + static THC_DECL storage_type s_(const expr_type& a) { + return ScalarConvert::to(a); + } + + static THC_DECL bool lt(const half& a, const half& b) { return __hlt(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa < fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) < THC_half2float(b); -#endif } - - static inline __host__ __device__ bool le(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL bool le(const half& a, const half& b) { return __hle(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa <= fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) <= THC_half2float(b); -#endif } - static inline __host__ __device__ bool gt(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL bool gt(const half& a, const half& b) { return __hgt(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa > fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) > THC_half2float(b); -#endif } - static inline __host__ __device__ bool ge(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL bool ge(const half& a, const half& b) { return __hge(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa >= fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) >= THC_half2float(b); -#endif } - static inline __host__ __device__ bool eq(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL bool eq(const half& a, const half& b) { return __heq(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa == fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) == THC_half2float(b); -#endif } - static inline __host__ __device__ bool ne(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL bool ne(const half& a, const half& b) { return __hne(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa != fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) != THC_half2float(b); -#endif } - - static inline __host__ __device__ half exp(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half exp(const half& a) { return hexp(a); -#else - float fa = __half2float(a); - return __float2half(expf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(expf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half log(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half log(const half& a) { return hlog(a); -#else - float fa = __half2float(a); - return __float2half(logf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(logf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half log1p(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(log1pf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(log1pf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half cos(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half cos(const half& a) { return hcos(a); -#else - float fa = __half2float(a); - return __float2half(cosf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(cosf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half sin(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half sin(const half& a) { return hsin(a); -#else - float fa = __half2float(a); - return __float2half(sinf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(sinf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half sqrt(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half sqrt(const half& a) { return hsqrt(a); -#else - float fa = __half2float(a); - return __float2half(sqrtf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(sqrtf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half rsqrt(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half rsqrt(const half& a) { return hrsqrt(a); -#else - float fa = __half2float(a); - return __float2half(rsqrtf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(rsqrtf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half ceil(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half ceil(const half& a) { return hceil(a); -#else - float fa = __half2float(a); - return __float2half(ceilf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(ceilf(THC_half2float(a))); -#endif } - static inline __host__ __device__ half floor(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half floor(const half& a) { return hfloor(a); -#else - float fa = __half2float(a); - return __float2half(floorf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(floorf(THC_half2float(a))); -#endif } - static inline __host__ __device__ half trunc(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half trunc(const half& a) { return htrunc(a); -#else - float fa = __half2float(a); - return __float2half(truncf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(truncf(THC_half2float(a))); -#endif } - static inline __host__ __device__ half neg(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half neg(const half& a) { return __hneg(a); -#else - float fa = __half2float(a); - return __float2half(-fa); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(-(THC_half2float(a))); -#endif } - static inline __host__ __device__ half acos(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(acosf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(acosf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half cosh(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(coshf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(coshf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half asin(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(asinf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(asinf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half sinh(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(sinhf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(sinhf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half tan(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(tanf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(tanf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half atan(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(atanf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(atanf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half tanh(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(tanhf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(tanhf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half abs(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(fabs(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(fabs(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half round(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(roundf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(roundf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half frac(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(fa - truncf(fa)); -#else // __CUDA_ARCH__ - float fa = THC_half2float(a); - return THC_float2half(fa - floorf(fa)); -#endif - } - - static inline __host__ __device__ half cinv(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(1.0f / fa); -#else // __CUDA_ARCH__ - return THC_float2half(1.0f / THC_half2float(a)); -#endif - } - - static inline __host__ __device__ half add(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL const half& add(const half& a, const half& b) { return __hadd(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa + fb ); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) + THC_half2float(b)); -#endif } - - static inline __host__ __device__ half div(half a, half b) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa / fb ); -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) / THC_half2float(b)); -#endif - } - - static inline __host__ __device__ half mul(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half mul(const half& a, const half& b) { return __hmul(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa * fb ); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) * THC_half2float(b)); -#endif } - static inline __host__ __device__ half sub(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static THC_DECL half sub(const half& a, const half& b) { return __hsub(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa - fb ); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) - THC_half2float(b)); -#endif } - static inline __host__ __device__ half pow(half a, half b) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half(powf(fa, fb)); -#else // __CUDA_ARCH__ - return THC_float2half(powf(THC_half2float(a), THC_half2float(b))); -#endif - } - -}; -#endif - -template <> -struct THCNumerics { - static inline __host__ __device__ float min() { return -FLT_MAX; } - static inline __host__ __device__ float max() { return FLT_MAX; } - - static inline __host__ __device__ bool lt(float a, float b) { return a < b; } - static inline __host__ __device__ bool le(float a, float b) { return a <= b; } - static inline __host__ __device__ bool gt(float a, float b) { return a > b; } - static inline __host__ __device__ bool ge(float a, float b) { return a >= b; } - static inline __host__ __device__ bool eq(float a, float b) { return a == b; } - static inline __host__ __device__ bool ne(float a, float b) { return a != b; } - - static inline __host__ __device__ float exp (float a) { return expf(a); } - static inline __host__ __device__ float log (float a) { return logf(a); } - static inline __host__ __device__ float log1p(float a) { return log1pf(a); } - static inline __host__ __device__ float cos (float a) { return cosf(a); } - static inline __host__ __device__ float sin (float a) { return sinf(a); } - static inline __host__ __device__ float sqrt (float a) { return sqrtf(a); } - static inline __host__ __device__ float rsqrt(float a) { return rsqrtf(a); } - static inline __host__ __device__ float ceil (float a) { return ceilf(a); } - static inline __host__ __device__ float floor(float a) { return floorf(a); } - static inline __host__ __device__ float trunc(float a) { return truncf(a); } - static inline __host__ __device__ float neg (float a) { return -a; } - static inline __host__ __device__ float acos (float a) { return acosf(a); } - static inline __host__ __device__ float cosh (float a) { return coshf(a); } - static inline __host__ __device__ float acosh(float a) { return acoshf(a); } - static inline __host__ __device__ float asin (float a) { return asinf(a); } - static inline __host__ __device__ float sinh (float a) { return sinhf(a); } - static inline __host__ __device__ float asinh(float a) { return asinhf(a); } - static inline __host__ __device__ float tan (float a) { return tanf(a); } - static inline __host__ __device__ float atan (float a) { return atanf(a); } - static inline __host__ __device__ float tanh (float a) { return tanhf(a); } - static inline __host__ __device__ float abs (float a) { return fabs(a); } - static inline __host__ __device__ float round(float a) { return roundf(a); } - static inline __host__ __device__ float frac (float a) { return a - truncf(a); } - static inline __host__ __device__ float cinv (float a) { return 1.0f / a; } - static inline __host__ __device__ float add (float a, float b) { return a + b; } - static inline __host__ __device__ float div (float a, float b) { return a / b; } - static inline __host__ __device__ float mul (float a, float b) { return a * b; } - static inline __host__ __device__ float sub (float a, float b) { return a - b; } - static inline __host__ __device__ float pow (float a, float b) { return powf(a, b); } -}; - -template <> -struct THCNumerics { - static inline __host__ __device__ double min() { return -DBL_MAX; } - static inline __host__ __device__ double max() { return DBL_MAX; } - - static inline __host__ __device__ bool lt(double a, double b) { return a < b; } - static inline __host__ __device__ bool le(double a, double b) { return a <= b; } - static inline __host__ __device__ bool gt(double a, double b) { return a > b; } - static inline __host__ __device__ bool ge(double a, double b) { return a >= b; } - static inline __host__ __device__ bool eq(double a, double b) { return a == b; } - static inline __host__ __device__ bool ne(double a, double b) { return a != b; } - - static inline __host__ __device__ double exp (double a) { return ::exp(a); } - static inline __host__ __device__ double log (double a) { return ::log(a); } - static inline __host__ __device__ double log1p(double a) { return ::log1p(a); } - static inline __host__ __device__ double cos (double a) { return ::cos(a); } - static inline __host__ __device__ double sin (double a) { return ::sin(a); } - static inline __host__ __device__ double sqrt (double a) { return ::sqrt(a); } - static inline __host__ __device__ double rsqrt(double a) { return ::rsqrt(a); } - static inline __host__ __device__ double ceil (double a) { return ::ceil(a); } - static inline __host__ __device__ double floor(double a) { return ::floor(a); } - static inline __host__ __device__ double trunc(double a) { return ::trunc(a); } - static inline __host__ __device__ double neg (double a) { return -a; } - static inline __host__ __device__ double acos (double a) { return ::acos(a); } - static inline __host__ __device__ double cosh (double a) { return ::cosh(a); } - static inline __host__ __device__ double acosh(double a) { return ::acosh(a); } - static inline __host__ __device__ double asin (double a) { return ::asin(a); } - static inline __host__ __device__ double sinh (double a) { return ::sinh(a); } - static inline __host__ __device__ double asinh(double a) { return ::asinh(a); } - static inline __host__ __device__ double tan (double a) { return ::tan(a); } - static inline __host__ __device__ double atan (double a) { return ::atan(a); } - static inline __host__ __device__ double tanh (double a) { return ::tanh(a); } - static inline __host__ __device__ double abs (double a) { return ::abs(a); } - static inline __host__ __device__ double round(double a) { return ::round(a); } - static inline __host__ __device__ double frac (double a) { return a - ::trunc(a); } - static inline __host__ __device__ double cinv (double a) { return 1.0 / a; } - static inline __host__ __device__ double add (double a, double b) { return a + b; } - static inline __host__ __device__ double div (double a, double b) { return a / b; } - static inline __host__ __device__ double mul (double a, double b) { return a * b; } - static inline __host__ __device__ double sub (double a, double b) { return a - b; } - static inline __host__ __device__ double pow (double a, double b) { return ::pow(a, b); } -}; - -/// `half` has some type conversion issues associated with it, since it -/// is a struct without a constructor/implicit conversion constructor. -/// We use this to convert scalar values to the given type that the -/// tensor expects. -template -struct ScalarConvert { - static __host__ __device__ Out to(const In v) { return (Out) v; } -}; - -#ifdef CUDA_HALF_TENSOR -template -struct ScalarConvert { - static __host__ __device__ Out to(const half v) { -#ifdef __CUDA_ARCH__ - return (Out) __half2float(v); -#else - return (Out) THC_half2float(v); -#endif + static THC_DECL half div (const half& a, const half& b) { + return hdiv(a,b); } -}; - -template -struct ScalarConvert { - static __host__ __device__ half to(const In v) { -#ifdef __CUDA_ARCH__ - return __float2half((float) v); -#else - return THC_float2half((float) v); -#endif + static THC_DECL half mod (const half& a, const half& b) { + return __float2half(fmodf(__half2float(a), __half2float(b))); } +# endif }; -template <> -struct ScalarConvert { - static __host__ __device__ half to(const half v) { - return v; - } -}; #endif -#endif // THC_NUMERICS_INC +#endif // THC_NUMERICS_INC diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 094cf0b8..1511fb88 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -1,6 +1,5 @@ -#include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "THCTensorMath.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -8,238 +7,57 @@ template struct TensorAddConstantOp { + typedef THCNumerics N_; TensorAddConstantOp(T v) : val(v) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in + val; + *out = N_::s_(N_::add(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v += val; + this->operator()(v, v); } - - const T val; + const typename N_::storage_type val; }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorAddConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorAddConstantOp(half v) : val(v) {} -#else - TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} -#endif - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in, val); -#else - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hadd(*v, val); -#else - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); -#endif - } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif -}; -#endif // CUDA_HALF_TENSOR - - template struct TensorSubConstantOp { + typedef THCNumerics N_; TensorSubConstantOp(T v) : val(v) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in - val; + *out = N_::s_(N_::sub(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v -= val; - } - - const T val; -}; - - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSubConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorSubConstantOp(half v): val(THC_float2half(-(THC_half2float(v)))) {} -#else - TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} -#endif - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in, val); -#else - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); -#endif + this->operator()(v, v); } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hadd(*v, val); -#else - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); -#endif - } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif + const typename N_::storage_type val; }; -#endif // CUDA_HALF_TENSOR template struct TensorMulConstantOp { TensorMulConstantOp(T v) : val(v) {} + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in * val; + *out = N_::s_(N_::mul(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v *= val; + this->operator()(v, v); } - - const T val; + const typename N_::storage_type val; }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorMulConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorMulConstantOp(half v) : val(v) {} -#else - TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} -#endif - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*in, val); -#else - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hmul(*v, val); -#else - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); -#endif - } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif -}; -#endif // CUDA_HALF_TENSOR - template struct TensorDivConstantOp { - TensorDivConstantOp(T v) : val(v) {} + typedef THCNumerics N_; + TensorDivConstantOp(const T& v) : val(N_::div(N_::Constants::one(), v)) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in / val; + *out = N_::s_(N_::div(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v /= val; + this->operator()(v, v); } - - const T val; + const typename N_::storage_type val; }; -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(float v) : val(1.f / v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = *in * val; - } - - __device__ __forceinline__ void operator()(float* v) { - *v *= val; - } - - const float val; -}; - -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(double v) : val(1. / v) {} - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = *in * val; - } - - __device__ __forceinline__ void operator()(double* v) { - *v *= val; - } - - const double val; -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorDivConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorDivConstantOp(half v) : val(ScalarInv::to(v)) {} -#else - TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} -#endif - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*in, val); -#else - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hmul(*v, val); -#else - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); -#endif - } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif -}; -#endif // CUDA_HALF_TENSOR - template struct TensorRemainderOp { TensorRemainderOp(T v) : val(v) {} @@ -293,7 +111,7 @@ struct TensorRemainderOp { __device__ __forceinline__ void operator()(half* out, half* in) { #ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*in, __hmul(val, hfloor(__hdiv(*in, val)))); + *out = __hsub(*in, __hmul(val, hfloor(hdiv(*in, val)))); #else float fin = __half2float(*in); float fout = fin - fval * floorf(fin / fval); @@ -303,7 +121,7 @@ struct TensorRemainderOp { __device__ __forceinline__ void operator()(half* v) { #ifdef CUDA_HALF_INSTRUCTIONS - *v = __hsub(*v, __hmul(val, hfloor(__hdiv(*v, val)))); + *v = __hsub(*v, __hmul(val, hfloor(hdiv(*v, val)))); #else float fv = __half2float(*v); fv = fv - fval * floorf(fv / fval); diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh index de96cadd..f9499c5e 100644 --- a/lib/THC/THCTensorMathPointwise.cuh +++ b/lib/THC/THCTensorMathPointwise.cuh @@ -1,9 +1,8 @@ #ifndef THC_TENSORMATH_POINTWISE_CUH #define THC_TENSORMATH_POINTWISE_CUH -#include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "THCTensorMath.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -11,313 +10,96 @@ template struct TensorSigmoidOp { + typedef THCNumerics N_; + typedef typename N_::Constants NC_; __device__ __forceinline__ void operator()(T* out, T* in) const { - T one = (T) 1.0; - *out = one / (one + THCNumerics::exp(- *in)); + *out = N_::div(NC_::one(), N_::add(NC_::one(), N_::exp(N_::neg(*in)))); } - __device__ __forceinline__ void operator()(T* v) const { - T one = (T) 1.0; - *v = one / (one + THCNumerics::exp(- *v)); + this->operator()(v, v); } }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSigmoidOp { - __device__ __forceinline__ void operator()(half* out, half* in) const { -#ifdef CUDA_HALF_INSTRUCTIONS - half one = ScalarConvert::to(1); - *out = hdiv(one, __hadd(one, hexp(__hneg(*in)))); -#else - float fin = __half2float(*in); - *out = __float2half(1.0f / (1.0f + expf(- fin))); -#endif - } - - __device__ __forceinline__ void operator()(half* v) const { -#ifdef CUDA_HALF_INSTRUCTIONS - half one = ScalarConvert::to(1); - *v = hdiv(one, __hadd(one, hexp(__hneg(*v)))); -#else - float fv = __half2float(*v); - *v = __float2half(1.0f / (1.0f + expf(- fv))); -#endif - } -}; -#endif - template struct TensorSignOp { + typedef THCNumerics N_; + typedef THCNumConstants NC_; + __device__ __forceinline__ void operator()(T* out, T* in) { - T orig = *in; - *out = (orig > 0) - (orig < 0); + const T& orig = *in; + *out = (N_::gt(orig, NC_::zero()) ? NC_::one() : + N_::lt(orig, NC_::zero()) ? N_::neg(NC_::one()) : + NC_::zero()); } - __device__ __forceinline__ void operator()(T* v) { - T orig = *v; - *v = (orig > 0) - (orig < 0); + this->operator()(v, v); } }; -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(unsigned char* out, unsigned char* in) { - unsigned char orig = *in; - *out = (orig == 0) ? 0 : 1; - } - - __device__ __forceinline__ void operator()(unsigned char* v) { - unsigned char orig = *v; - *v = (orig == 0) ? 0 : 1; - } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - half zero = ScalarConvert::to(0); - half orig = *in; - *out = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); -#else - float orig = __half2float(*in); - *out = __float2half((orig > 0) - (orig < 0)); -#endif - } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - half zero = ScalarConvert::to(0); - half orig = *v; - *v = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); -#else - float orig = __half2float(*v); - *v = __float2half((orig > 0) - (orig < 0)); -#endif - } -}; -#endif - template struct TensorAddOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out += *in; - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 + *in2; + *out = N_::s_(N_::add(*in1, *in2)); } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorAddOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*out, *in); -#else - float fout = __half2float(*out); - float fin = __half2float(*in); - fout += fin; - *out = __float2half(fout); -#endif + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, in, out); } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in1, *in2); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 + fin2; - *out = __float2half(fout); -#endif - } }; -#endif // CUDA_HALF_TENSOR + template struct TensorCAddOp { + typedef THCNumerics N_; TensorCAddOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out, T* in) { - *out += val * *in; + this->operator()(out, out, in); } - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 + val * *in2; + *out = N_::s_(N_::add(*in1, N_::mul(val, *in2))); } - T val; + const typename N_::storage_type val; }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorCAddOp { - TensorCAddOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*out, __hmul(val, *in)); -#else - float fout = __half2float(*out); - float fval = __half2float(val); - float fin = __half2float(*in); - - fout += fval * fin; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in1, __hmul(val, *in2)); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fval = __half2float(val); - - float fout = fin1 + fval * fin2; - *out = __float2half(fout); -#endif - } - - half val; -}; -#endif // CUDA_HALF_TENSOR template struct TensorSubOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out -= *in; - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 - *in2; + *out = N_::s_(N_::sub(*in1, *in2)); } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSubOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*out, *in); -#else - float fout = __half2float(*out); - float fin = __half2float(*in); - fout -= fin; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*in1, *in2); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 - fin2; - *out = __float2half(fout); -#endif + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorMulOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out *= *in; - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 * *in2; + *out = N_::s_(N_::mul(*in1, *in2)); } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorMulOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*out, *in); -#else - float fout = __half2float(*out); - float fin = __half2float(*in); - fout *= fin; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*in1, *in2); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 * fin2; - *out = __float2half(fout); -#endif + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorPowOp { - TensorPowOp(T v) : val(v) {} + typedef THCNumerics N_; + TensorPowOp(T v) : val(N_::s_(v)) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = powf((float) *in, (float) val); + *out = N_::s_(N_::pow(*in, val)); } __device__ __forceinline__ void operator()(T* v) { - *v = powf((float) *v, (float) val); + this->operator()(v, v); } - - const T val; + const typename N_::storage_type val; }; -template <> -struct TensorPowOp { - TensorPowOp(double v) : val(v) {} - - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = pow(*in, val); - } - - __device__ __forceinline__ void operator()(double* v) { - *v = pow(*v, val); - } - - const double val; -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorPowOp { - TensorPowOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fin = __half2float(*in); - float fval = __half2float(val); - float fout = powf(fin, fval); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - // No fp16 pow function yet - float fv = __half2float(*v); - float fval = __half2float(val); - float fout = powf(fv, fval); - *v = __float2half(fout); - } - - const half val; -}; -#endif // CUDA_HALF_TENSOR - template struct TensorTPowOp { TensorTPowOp(T v) : val(v) {} @@ -335,82 +117,25 @@ struct TensorTPowOp { template struct TensorCPowOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = powf((float) *out, (float) *in); - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = powf((float) *in1, (float) *in2); + *out = N_::s_(N_::pow(*in1,*in2)); } -}; - -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = pow(*out, *in); - } - - __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { - *out = pow(*in1, *in2); + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout = powf(fout, fin); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - // No fp16 pow function yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = powf(fin1, fin2); - *out = __float2half(fout); - } -}; -#endif // CUDA_HALF_TENSOR - template struct TensorDivOp { - __device__ __forceinline__ void - operator()(T* out, T* in) { - *out /= *in; - } - - __device__ __forceinline__ void - operator()(T* out, T* in1, T* in2) { - *out = *in1 / *in2; - } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorDivOp { - __device__ __forceinline__ void - operator()(half* out, half* in) { - // No fp16 div instruction yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout /= fin; - *out = __float2half(fout); + typedef THCNumerics N_; + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = N_::s_(N_::div(*in1,*in2)); } - - __device__ __forceinline__ void - operator()(half* out, half* in1, half* in2) { - // No fp16 div instruction yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 / fin2; - *out = __float2half(fout); + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorCRemainderOp { @@ -450,7 +175,7 @@ template <> struct TensorCRemainderOp { __device__ __forceinline__ void operator()(half* out, half* in) { #ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*out, __hmul(*in, hfloor(__hdiv(*out, *in)))); + *out = __hsub(*out, __hmul(*in, hfloor(hdiv(*out, *in)))); #else float fout = __half2float(*out); float fin = __half2float(*in); @@ -460,7 +185,7 @@ struct TensorCRemainderOp { __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { #ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*in1, __hmul(*in2, hfloor(__hdiv(*in1, *in2)))); + *out = __hsub(*in1, __hmul(*in2, hfloor(hdiv(*in1, *in2)))); #else float fin1 = __half2float(*in1); float fin2 = __half2float(*in2); @@ -472,102 +197,60 @@ struct TensorCRemainderOp { template struct TensorCFmodOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *out % *in; - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 % *in2; - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = fmodf(*out, *in); - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = fmodf(*in1, *in2); - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = fmod(*out, *in); - } - - __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { - *out = fmod(*in1, *in2); + *out = N_::s_(N_::mod(*in1,*in2)); } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorClampOp { + typedef THCNumerics N_; TensorClampOp(T min, T max) : minValue(min), maxValue(max) {} __device__ __forceinline__ void operator()(T* out, T* in) { - T val = THCNumerics::lt(*in, maxValue) ? *in : maxValue; - *out = THCNumerics::gt(minValue, val) ? minValue : val; + T val = N_::lt(*in, maxValue) ? *in : maxValue; + *out = N_::gt(minValue, val) ? minValue : val; } __device__ __forceinline__ void operator()(T* v) { - T val = THCNumerics::lt(*v, maxValue) ? *v : maxValue; - *v = THCNumerics::gt(minValue, val) ? minValue : val; + T val = N_::lt(*v, maxValue) ? *v : maxValue; + *v = N_::gt(minValue, val) ? minValue : val; } - - const T minValue; - const T maxValue; + const typename N_::storage_type minValue; + const typename N_::storage_type maxValue; }; template struct TensorLerpOp { + typedef THCNumerics N_; TensorLerpOp(T w) : w(w) {} - __device__ __forceinline__ void operator()(T *out, T *a, T *b) { - *out = THCNumerics::add( - *a, - THCNumerics::mul( - w, - THCNumerics::sub(*b, *a) - ) - ); + *out = N_::add(*a, N_::mul(w, N_::sub(*b, *a))); } - - const T w; + T w; }; template struct TensorCrossOp { + typedef THCNumerics N_; TensorCrossOp(long sx, long sy, long so) : sx(sx), sy(sy), so(so) {} - __device__ __forceinline__ void operator()(T* out, T* x, T*y) { - out[0 * so] = THCNumerics::sub( - THCNumerics::mul(x[1 * sx], y[2 * sy]), - THCNumerics::mul(x[2 * sx], y[1 * sy]) + out[0 * so] = N_::sub( + N_::mul(x[1 * sx], y[2 * sy]), + N_::mul(x[2 * sx], y[1 * sy]) ); - out[1 * so] = THCNumerics::sub( - THCNumerics::mul(x[2 * sx], y[0 * sy]), - THCNumerics::mul(x[0 * sx], y[2 * sy]) + out[1 * so] = N_::sub( + N_::mul(x[2 * sx], y[0 * sy]), + N_::mul(x[0 * sx], y[2 * sy]) ); - out[2 * so] = THCNumerics::sub( - THCNumerics::mul(x[0 * sx], y[1 * sy]), - THCNumerics::mul(x[1 * sx], y[0 * sy]) + out[2 * so] = N_::sub( + N_::mul(x[0 * sx], y[1 * sy]), + N_::mul(x[1 * sx], y[0 * sy]) ); } @@ -576,36 +259,39 @@ struct TensorCrossOp { template struct TensorMaxOp { + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::gt(*out, *in) ? *out : *in; + *out = N_::gt(*out, *in) ? *out : *in; } __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::gt(*in1, *in2) ? *in1 : *in2; + *out = N_::gt(*in1, *in2) ? *in1 : *in2; } }; template struct TensorMinOp { + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::lt(*out, *in) ? *out : *in; + *out = N_::lt(*out, *in) ? *out : *in; } __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::lt(*in1, *in2) ? *in1 : *in2; + *out = N_::lt(*in1, *in2) ? *in1 : *in2; } }; template struct TensorMaxValueOp { + typedef THCNumerics N_; TensorMaxValueOp(T v) : val(v) {} __device__ __forceinline__ void operator()(T* out) { - *out = THCNumerics::gt(*out, val) ? *out : val; + *out = N_::gt(*out, val) ? *out : val; } __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::gt(*in, val) ? *in : val; + *out = N_::gt(*in, val) ? *in : val; } T val; @@ -613,51 +299,39 @@ struct TensorMaxValueOp { template struct TensorMinValueOp { + typedef THCNumerics N_; TensorMinValueOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out) { - *out = THCNumerics::lt(*out, val) ? *out : val; + *out = N_::lt(*out, val) ? *out : val; } - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::lt(*in, val) ? *in : val; + *out = N_::lt(*in, val) ? *in : val; } - T val; }; template struct TensorAddCMulOp { + typedef THCNumerics N_; TensorAddCMulOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::add( - *out, - THCNumerics::mul( - val, - THCNumerics::mul(*in1, *in2) - ) - ); + *out = N_::add(*out,N_::mul(val,N_::mul(*in1, *in2))); } - T val; }; template struct TensorAddCDivOp { + typedef THCNumerics N_; TensorAddCDivOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::add( - *out, - THCNumerics::mul( - val, - THCNumerics::div(*in1, *in2) - ) - ); + *out = N_::add( *out, + N_::mul(val, + N_::div(*in1, *in2) + ) + ); } - - T val; + typename N_::storage_type val; }; #endif // THC_TENSORMATH_POINTWISE_CUH diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu index 0b4094b8..b7e82021 100644 --- a/lib/THC/generic/THCTensorMathPairwise.cu +++ b/lib/THC/generic/THCTensorMathPairwise.cu @@ -63,7 +63,7 @@ THC_API void THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value) { THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); - THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); + THArgCheck(value != THCNumConstants::zero(), 3, "divide by zero"); if (self_ == src_) { if (!THC_pointwiseApply1(state, self_, TensorDivConstantOp(value))) {