From 38df9973505161fc9af1ac69ad04b83fd4ed7fef Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Mon, 16 Jan 2017 18:43:36 -0800
Subject: [PATCH] Squashed commit of the following:

commit ec4653659f0a0301da5c8aa533d7592ffdb79329
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Mon Jan 16 18:35:45 2017 -0800

    cleanup

commit b6771be0b8f72f1ff4fba98dd9d88700cb0eeb0d
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Sun Jan 15 16:03:40 2017 -0800

    Enabling half instructions for selected Pascal archs

commit 18f5ce19c42a7981782a4c6168d4e435ffa9b20e
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Sun Jan 15 15:31:38 2017 -0800

    cleanup

commit 0114d65f649e0eb528bf45691ddd4476c4a4db8b
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Sun Jan 15 02:16:14 2017 -0800

    Build working

commit 5b7ed0477dac3f6bf08f0c11487fc95ea36f41f6
Merge: 03a77af 16fb59f
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Fri Jan 13 18:44:52 2017 -0800

    Merge remote-tracking branch 'upstream/master' into spec_refactor

    Conflicts:
    	CMakeLists.txt
    	Tensor.lua
    	generic/CStorage.c
    	generic/CTensor.c
    	init.c
    	lib/THC/THCCachingAllocator.cpp
    	lib/THC/THCGeneral.c
    	lib/THC/THCGenerateHalfType.h
    	lib/THC/THCHalf.cu
    	lib/THC/THCStorageCopy.c
    	lib/THC/THCStorageCopy.cu
    	lib/THC/THCTensorCopy.c
    	lib/THC/generic/THCStorageCopy.c
    	lib/THC/generic/THCStorageCopy.cu
    	lib/THC/generic/THCStorageCopy.h
    	lib/THC/generic/THCTensorCopy.h
    	test/test_shutdown.lua

commit 03a77af912a2b5b8d9478a1237a9d8eaaaa8ec3a
Merge: ed0f6b4 2876894
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Wed Nov 23 00:59:51 2016 -0800

    Merge branch 'android' into spec_refactor

commit ed0f6b47bfd693bde55d5d1897de46c1076fcb92
Merge: b572cf9 2d75d41
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Mon Nov 21 17:28:04 2016 -0800

    Merge branch 'upstream' into spec_refactor

commit 287689458df744e707b321bb72d6ff2635948966
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Thu Nov 17 03:08:21 2016 -0800

    Added C++ flags

commit 9328d7dd5ff31b180250a7993ad55e4a0f8e2a51
Merge: 80df9d9 10ef056
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Wed Nov 16 21:49:32 2016 -0800

    Merge branch 'cpu_half' into android

commit 80df9d995043b12d34a9f1864884a229733efc26
Merge: 0afffe1 9a6ba41
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Wed Nov 16 21:25:06 2016 -0800

    Merge branch 'getmeminfo-fix'

commit 9a6ba41117663fcd670a2f2b412383dc5d0aa291
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Tue Nov 15 14:48:52 2016 -0800

    Bugfix, test extended

commit 9aef731946f687e08c2ba4438ab985c433e3044f
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Tue Nov 15 02:46:49 2016 -0800

    Added some memory allocations to test_shutdown to test  acching allocator

commit 838ec700c17cb9f2b942c8110184fa1eef4c1301
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Tue Nov 15 02:29:53 2016 -0800

    Implemented cudaMemGetInfo for caching allocator

commit b572cf9d86a6442f3b81a8c102e4a8c670611230
Merge: 38882e6 10ef056
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Tue Nov 1 13:02:47 2016 -0700

    Merge branch 'cpu_half' into spec_refactor

commit 10ef05657e6f800127797e620c1ad2ced596d33e
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Sat Oct 29 15:54:48 2016 -0700

    Using half from TH

commit 18383174c82fbfd08a9bd8b84639aeec2e4ee22a
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Fri Oct 28 17:13:04 2016 -0700

    checkpoint

commit 38882e699756be92d3c241cdf3533ec460e72e54
Merge: acaf1a6 21ad069
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Fri Oct 21 00:42:28 2016 -0700

    cutorch refactoring

commit acaf1a64d4fed30de3fa322926cd1babcfa1cd3b
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Sun Oct 16 22:27:04 2016 -0700

    Checkpoint

commit ac964ee15f1cf434bdcb674958abaecc5018e7ba
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Fri Oct 14 01:03:47 2016 -0700

    checkpoint

commit 8a75e344ed2a8056317047b5451d5c3451f6469b
Merge: a80d28b ada3cbd
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Wed Oct 12 17:43:51 2016 -0700

    Merge remote-tracking branch 'upstream/master' into spec_refactor

commit a80d28bbc90b0aae3bd8335d3dae2a8956dcecb6
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Wed Oct 12 17:41:43 2016 -0700

    checkpoint

commit 79a900660fdff195fa9c1d88aa0865aec95ba2b2
Merge: 6933617 9efd392
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Mon Oct 10 12:42:21 2016 -0700

    Merge remote-tracking branch 'upstream/master'

commit 69336174ee3df9d680c8c80a8e30f30bcdcf2025
Merge: b928ca0 afd74e7
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Sun Oct 2 21:35:17 2016 -0700

    Merge remote-tracking branch 'upstream/master'

commit b928ca019ea061cb9ebaca2dd81450aa24f25268
Author: Boris Fomitchev <bfomitchev@nvidia.com>
Date:   Fri Sep 30 02:03:56 2016 -0700

    temporarily force pseudo-fp16 mode (HAS_HALF_INSTRUCTIONS=FALSE) for Pascal: https://github.com/torch/cutorch/issues/520
---
 lib/THC/THCHalf.h                        |  63 +-
 lib/THC/THCNumerics.cuh                  | 718 +++++------------------
 lib/THC/THCTensorMathPairwise.cu         | 222 +------
 lib/THC/THCTensorMathPointwise.cuh       | 508 +++-------------
 lib/THC/generic/THCTensorMathPairwise.cu |   2 +-
 5 files changed, 331 insertions(+), 1182 deletions(-)

diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index 7c055e7a..dbeb4311 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -1,16 +1,21 @@
 #ifndef THC_HALF_CONVERSION_INC
-#define THC_HALF_CONVERSION_INC
+# define THC_HALF_CONVERSION_INC
 
-#include "THCGeneral.h"
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "cublas_v2.h"
+#include "cuda_fp16.h"
 
 /* We compile with CudaHalfTensor support if we have this: */
 #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
-#define CUDA_HALF_TENSOR 1
+#  define CUDA_HALF_TENSOR 1
 #endif
 
 #ifdef CUDA_HALF_TENSOR
 
-#include <cuda_fp16.h>
+#include "THCGeneral.h"
+#include "THHalf.h"
+
 #include <stdint.h>
 
 THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
@@ -24,6 +29,54 @@ THC_API int THC_nativeHalfInstructions(THCState *state);
 /* Check for performant native fp16 support on the current device */
 THC_API int THC_fastHalfInstructions(THCState *state);
 
-#endif /* CUDA_HALF_TENSOR */
+ #  if defined (__CUDA_ARCH__)
+/* use instrintic functons defined for device only in cuda_fp16.h */
+#   define THC_FLOAT_TO_HALF(x) __float2half((float)x)
+#   define THC_HALF_TO_FLOAT(x) __half2float(x)
+#   define THC_DECL __host__ __device__ __forceinline__
+#  else
+/* use host conversion functions */
+#   define THC_FLOAT_TO_HALF(x) THC_float2half((float)x)
+#   define THC_HALF_TO_FLOAT(x) THC_half2float(x)
+#   define THC_DECL inline
+#  endif
 
+#if __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620
+# define CUDA_HALF_INSTRUCTIONS 1
 #endif
+
+#if defined (__cplusplus__) || defined (__CUDACC__)
+
+/// `half` has some type conversion issues associated with it, since it
+/// is a struct without a constructor/implicit conversion constructor.
+/// We use this to convert scalar values to the given type that the
+/// tensor expects.
+
+template <typename In, typename Out>
+struct ScalarConvert {
+  static THC_DECL Out to(const In& v) { return Out(v); }
+};
+
+template <typename Out>
+struct ScalarConvert<half, Out> {
+  static THC_DECL Out to(const half& v) {
+    return (Out) THC_HALF_TO_FLOAT(v);
+  }
+};
+
+template <typename In>
+struct ScalarConvert<In, half> {
+  static THC_DECL half to(const In& v) {
+    return THC_FLOAT_TO_HALF(v);
+  }
+};
+
+template <>
+struct ScalarConvert<half, half> {
+  static THC_DECL const half& to(const half& v) {
+    return v;
+  }
+};
+#  endif /* __cplusplus__ */
+# endif /* CUDA_HALF_TENSOR */
+#endif /* THC_HALF_CONVERSION_INC */
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index 09443605..6b504e77 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -1,631 +1,235 @@
 #ifndef THC_NUMERICS_INC
 #define THC_NUMERICS_INC
 
-#include <cuda.h>
-#include <limits.h>
-#include "THCHalf.h"
-
-/// Class for numeric limits of the particular data type, which
-/// includes support for `half`.
-/// Unfortunately since `half` does not have a constructor, these have
-/// to be expressed as functions (either that or non-const statics).
+
+#include <limits>
+#include "THCGeneral.h"
+
 template <typename T>
-struct THCNumerics {
+struct THCNumConstants
+{
+   static THC_DECL const T one()  { return T(1); }
+   static THC_DECL const T zero() { return T(0); }
+
 };
 
 template <>
-struct THCNumerics<unsigned char> {
-  static inline __host__ __device__ unsigned char min() { return 0; }
-  static inline __host__ __device__ unsigned char max() { return UCHAR_MAX; }
-
-  static inline __host__ __device__ bool lt(unsigned char a, unsigned char b) { return a < b; }
-  static inline __host__ __device__ bool le(unsigned char a, unsigned char b) { return a <= b; }
-  static inline __host__ __device__ bool gt(unsigned char a, unsigned char b) { return a > b; }
-  static inline __host__ __device__ bool ge(unsigned char a, unsigned char b) { return a >= b; }
-  static inline __host__ __device__ bool eq(unsigned char a, unsigned char b) { return a == b; }
-  static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; }
-
-  static inline __host__ __device__  unsigned char add(unsigned char a, unsigned char b) { return a + b; }
-  static inline __host__ __device__  unsigned char mul(unsigned char a, unsigned char b) { return a * b; }
-  static inline __host__ __device__  unsigned char sub(unsigned char a, unsigned char b) { return a - b; }
-  static inline __host__ __device__  unsigned char div(unsigned char a, unsigned char b) { return a / b; }
-  static inline __host__ __device__  unsigned char abs(unsigned char a) { return abs(a); }
+struct THCNumConstants<half>
+{
+  static THC_DECL const half one()  { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */
+  static THC_DECL const half zero() { half ret; ret.x = 0;  return ret;}
 };
 
-template <>
-struct THCNumerics<char> {
-  static inline __host__ __device__ char min() { return CHAR_MIN; }
-  static inline __host__ __device__ char max() { return CHAR_MAX; }
-
-  static inline __host__ __device__ bool lt(char a, char b) { return a < b; }
-  static inline __host__ __device__ bool le(char a, char b) { return a <= b; }
-  static inline __host__ __device__ bool gt(char a, char b) { return a > b; }
-  static inline __host__ __device__ bool ge(char a, char b) { return a >= b; }
-  static inline __host__ __device__ bool eq(char a, char b) { return a == b; }
-  static inline __host__ __device__ bool ne(char a, char b) { return a != b; }
-
-  static inline __host__ __device__  char add(char a, char b) { return a + b; }
-  static inline __host__ __device__  char mul(char a, char b) { return a * b; }
-  static inline __host__ __device__  char sub(char a, char b) { return a - b; }
-  static inline __host__ __device__  char div(char a, char b) { return a / b; }
-  static inline __host__ __device__  char abs(char a) { return abs(a); }
+template <typename T, typename M>
+struct THCNumCommonBase {
+   typedef T storage_type;
+   /* type value should be converted to before doing math on it.
+      For most types except 16-bit floats, MathType==StorageType.
+   */
+   typedef M math_type;
+
+   /* type of math operation result , like (a*b). Usually == StorageType */
+   typedef T expr_type;
+
+  static THC_DECL  math_type m_(const storage_type& a) {
+    return ScalarConvert<storage_type, math_type>::to(a);
+  }
+  static THC_DECL  expr_type e_(const math_type& a) {
+    return ScalarConvert<math_type, expr_type>::to(a);
+  }
+  static THC_DECL  storage_type s_(const expr_type& a) {
+    return ScalarConvert<expr_type, storage_type>::to(a);
+  }
+
+  static  THC_DECL bool lt(const storage_type&  a, const storage_type&  b) { return m_(a) < m_(b);  }
+  static  THC_DECL bool le(const storage_type&  a, const storage_type&  b) { return m_(a) <= m_(b); }
+  static  THC_DECL bool gt(const storage_type&  a, const storage_type&  b) { return m_(a) > m_(b);  }
+  static  THC_DECL bool ge(const storage_type&  a, const storage_type&  b) { return m_(a) >= m_(b); }
+  static  THC_DECL bool eq(const storage_type&  a, const storage_type&  b) { return m_(a) == m_(b); }
+  static  THC_DECL bool ne(const storage_type&  a, const storage_type&  b) { return m_(a) != m_(b); }
+
+  static  THC_DECL  expr_type  add(const storage_type&  a, const storage_type&  b) { return e_(m_(a) + m_(b)); }
+  static  THC_DECL  expr_type  mul(const storage_type&  a, const storage_type&  b) { return e_(m_(a) * m_(b)); }
+  static  THC_DECL  expr_type  sub(const storage_type&  a, const storage_type&  b) { return e_(m_(a) - m_(b)); }
+  static  THC_DECL  expr_type  div(const storage_type&  a, const storage_type&  b) { return e_(m_(a) / m_(b)); }
+  static  THC_DECL  expr_type  abs(const storage_type&  a) { bool isneg = (a<0); return e_(isneg ? -a  : a); }
+  static  THC_DECL  expr_type  neg(const storage_type& a) { return  e_(-m_(a)); }
+  static  THC_DECL  expr_type  pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); }
+  static  THC_DECL  expr_type  mod(const storage_type&  a, const storage_type&  b) { return e_(m_(a) % m_(b)); }
+   static THC_DECL const T min()  { return std::numeric_limits<T>::min(); }
+   static THC_DECL const T max()  { return std::numeric_limits<T>::max(); }
 };
 
-template <>
-struct THCNumerics<short> {
-  static inline __host__ __device__ short min() { return SHRT_MIN; }
-  static inline __host__ __device__ short max() { return SHRT_MAX; }
-
-  static inline __host__ __device__ bool lt(short a, short b) { return a < b; }
-  static inline __host__ __device__ bool le(short a, short b) { return a <= b; }
-  static inline __host__ __device__ bool gt(short a, short b) { return a > b; }
-  static inline __host__ __device__ bool ge(short a, short b) { return a >= b; }
-  static inline __host__ __device__ bool eq(short a, short b) { return a == b; }
-  static inline __host__ __device__ bool ne(short a, short b) { return a != b; }
-
-  static inline __host__ __device__  short add(short a, short b) { return a + b; }
-  static inline __host__ __device__  short mul(short a, short b) { return a * b; }
-  static inline __host__ __device__  short sub(short a, short b) { return a - b; }
-  static inline __host__ __device__  short div(short a, short b) { return a / b; }
-  static inline __host__ __device__  short abs(short a) { return abs(a); }
+template <typename T, typename M, bool is_int>
+struct THCNumBase {};
+
+template <typename T, typename M>
+struct THCNumBase<T, M, true>  : public THCNumCommonBase<T, M> {
 };
 
 template <>
-struct THCNumerics<int> {
-  static inline __host__ __device__ int min() { return INT_MIN; }
-  static inline __host__ __device__ int max() { return INT_MAX; }
-
-  static inline __host__ __device__ bool lt(int a, int b) { return a < b; }
-  static inline __host__ __device__ bool le(int a, int b) { return a <= b; }
-  static inline __host__ __device__ bool gt(int a, int b) { return a > b; }
-  static inline __host__ __device__ bool ge(int a, int b) { return a >= b; }
-  static inline __host__ __device__ bool eq(int a, int b) { return a == b; }
-  static inline __host__ __device__ bool ne(int a, int b) { return a != b; }
-
-  static inline __host__ __device__  int add(int a, int b) { return a + b; }
-  static inline __host__ __device__  int mul(int a, int b) { return a * b; }
-  static inline __host__ __device__  int sub(int a, int b) { return a - b; }
-  static inline __host__ __device__  int div(int a, int b) { return a / b; }
-  static inline __host__ __device__  int abs(int a) { return ::abs(a); }
+struct THCNumBase<long, long, true>  : public THCNumCommonBase<long, long> {
+  static  THC_DECL  expr_type  abs(const storage_type&  a) { return labs(a); }
 };
 
-template <>
-struct THCNumerics<long> {
-  static inline __host__ __device__ long min() { return LONG_MIN; }
-  static inline __host__ __device__ long max() { return LONG_MAX; }
-
-  static inline __host__ __device__ bool lt(long a, long b) { return a < b; }
-  static inline __host__ __device__ bool le(long a, long b) { return a <= b; }
-  static inline __host__ __device__ bool gt(long a, long b) { return a > b; }
-  static inline __host__ __device__ bool ge(long a, long b) { return a >= b; }
-  static inline __host__ __device__ bool eq(long a, long b) { return a == b; }
-  static inline __host__ __device__ bool ne(long a, long b) { return a != b; }
-
-  static inline __host__ __device__  long add(long a, long b) { return a + b; }
-  static inline __host__ __device__  long mul(long a, long b) { return a * b; }
-  static inline __host__ __device__  long sub(long a, long b) { return a - b; }
-  static inline __host__ __device__  long div(long a, long b) { return a / b; };
-  static inline __host__ __device__  long abs(long a) { return labs(a); }
+template <typename T, typename M>
+struct THCNumBase<T, M, false> : public THCNumCommonBase<T, M> {
+  typedef THCNumCommonBase<T, M> Base;
+  using Base::e_;
+  using Base::m_;
+  using Base::s_;
+  using typename Base::math_type;
+  using typename Base::expr_type;
+  using typename Base::storage_type;
+
+  static THC_DECL const T min()  { return -std::numeric_limits<T>::max(); }
+  static THC_DECL const T max()  { return std::numeric_limits<T>::max(); }
+
+
+  static  THC_DECL  expr_type exp  (const storage_type& a) { return  e_(::exp(m_(a))); }
+  static  THC_DECL  expr_type log  (const storage_type& a) { return  e_(::log(m_(a))); }
+  static  THC_DECL  expr_type log1p(const storage_type& a) { return  e_(::log1p(m_(a))); }
+  static  THC_DECL  expr_type cos  (const storage_type& a) { return  e_(::cos(m_(a))); }
+  static  THC_DECL  expr_type sin  (const storage_type& a) { return  e_(::sin(m_(a))); }
+  static  THC_DECL  expr_type sqrt (const storage_type& a) { return  e_(::sqrt(m_(a))); }
+  static  THC_DECL  expr_type rsqrt(const storage_type& a) { return  e_(::rsqrt(m_(a))); }
+  static  THC_DECL  expr_type ceil (const storage_type& a) { return  e_(::ceil(m_(a))); }
+  static  THC_DECL  expr_type floor(const storage_type& a) { return  e_(::floor(m_(a))); }
+  static  THC_DECL  expr_type trunc(const storage_type& a) { return  e_(::trunc(m_(a))); }
+  static  THC_DECL  expr_type acos (const storage_type& a) { return  e_(::acos(m_(a))); }
+  static  THC_DECL  expr_type cosh (const storage_type& a) { return  e_(::cosh(m_(a))); }
+  static  THC_DECL  expr_type acosh(const storage_type& a) { return  e_(::acosh(m_(a))); }
+  static  THC_DECL  expr_type asin (const storage_type& a) { return  e_(::asin(m_(a))); }
+  static  THC_DECL  expr_type sinh (const storage_type& a) { return  e_(::sinh(m_(a))); }
+  static  THC_DECL  expr_type asinh(const storage_type& a) { return  e_(::asinh(m_(a))); }
+  static  THC_DECL  expr_type tan  (const storage_type& a) { return  e_(::tan(m_(a))); }
+  static  THC_DECL  expr_type atan (const storage_type& a) { return  e_(::atan(m_(a))); }
+  static  THC_DECL  expr_type tanh (const storage_type& a) { return  e_(::tanh(m_(a))); }
+  static  THC_DECL  expr_type abs  (const storage_type& a) { return  e_(::abs(m_(a))); }
+  static  THC_DECL  expr_type round(const storage_type& a) { return  e_(::round(m_(a))); }
+  static  THC_DECL  expr_type frac (const storage_type& a) { return  e_(m_(a) - ::trunc(m_(a))); }
+  static  THC_DECL  expr_type cinv (const storage_type& a) { return  Base::div(THCNumConstants<T>::one(), a); }
+  static  THC_DECL  expr_type pow  (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); }
+  static  THC_DECL  expr_type mod  (const storage_type& a, const storage_type& b) { return  e_(::fmod(m_(a), m_(b))); }
+
+};
+
+template <typename T>
+struct THCNumerics: public THCNumBase<T, T, std::numeric_limits<T>::is_integer> {
+  typedef THCNumCommonBase<T, T> Base;
+  using typename Base::math_type;
+  using typename Base::expr_type;
+  using typename Base::storage_type;
+  typedef THCNumConstants<T> Constants;
 };
 
 #ifdef CUDA_HALF_TENSOR
-template <>
-struct THCNumerics<half> {
-  static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; }
-  static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; }
 
-  static inline __host__ __device__ bool lt(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+template <>
+struct THCNumerics<half>: public THCNumBase<half, float, false>  {
+  static THC_DECL const half min() { half ret; ret.x = 0xFBFF;  return ret; }
+  static THC_DECL const half max() { half ret; ret.x = 0x7BFF;  return ret; }
+  typedef THCNumConstants<half> Constants;
+
+# ifndef CUDA_HALF_INSTRUCTIONS
+  typedef THCNumCommonBase<half, float> Base;
+  using typename Base::math_type;
+  using typename Base::expr_type;
+  using typename Base::storage_type;
+  using Base::e_;
+  using Base::m_;
+  using Base::s_;
+# else
+  typedef THCNumCommonBase<half, half> Base;
+  typedef typename Base::storage_type storage_type;
+  typedef typename Base::math_type    math_type;
+  typedef typename Base::expr_type    expr_type;
+  static THC_DECL  math_type m_(const storage_type& a) {
+    return ScalarConvert<storage_type, math_type>::to(a);
+  }
+  static THC_DECL  expr_type e_(const math_type& a) {
+    return ScalarConvert<math_type, expr_type>::to(a);
+  }
+  static THC_DECL  storage_type s_(const expr_type& a) {
+    return ScalarConvert<expr_type, storage_type>::to(a);
+  }
+
+  static  THC_DECL bool lt(const half& a, const half& b) {
     return __hlt(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa < fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) < THC_half2float(b);
-#endif
   }
-
-  static inline __host__ __device__ bool le(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL bool le(const half& a, const half& b) {
     return __hle(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa <= fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) <= THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool gt(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL bool gt(const half& a, const half& b) {
     return __hgt(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa > fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) > THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool ge(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL bool ge(const half& a, const half& b) {
     return __hge(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa >= fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) >= THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool eq(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL bool eq(const half& a, const half& b) {
     return __heq(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa == fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) == THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool ne(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL bool ne(const half& a, const half& b) {
     return __hne(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa != fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) != THC_half2float(b);
-#endif
   }
-
-  static inline __host__ __device__ half exp(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half exp(const half& a) {
     return hexp(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(expf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(expf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half log(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half log(const half& a) {
     return hlog(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(logf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(logf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half log1p(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(log1pf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(log1pf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half cos(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half cos(const half& a) {
     return hcos(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(cosf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(cosf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half sin(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half sin(const half& a) {
     return hsin(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(sinf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(sinf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half sqrt(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half sqrt(const half& a) {
     return hsqrt(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(sqrtf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(sqrtf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half rsqrt(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half rsqrt(const half& a) {
     return hrsqrt(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(rsqrtf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(rsqrtf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half ceil(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half ceil(const half& a) {
     return hceil(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(ceilf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(ceilf(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half floor(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half floor(const half& a) {
     return hfloor(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(floorf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(floorf(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half trunc(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half trunc(const half& a) {
     return htrunc(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(truncf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(truncf(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half neg(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half neg(const half& a) {
     return __hneg(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(-fa);
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(-(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half acos(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(acosf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(acosf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half cosh(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(coshf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(coshf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half asin(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(asinf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(asinf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half sinh(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(sinhf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(sinhf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half tan(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(tanf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(tanf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half atan(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(atanf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(atanf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half tanh(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(tanhf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(tanhf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half abs(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(fabs(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(fabs(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half round(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(roundf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(roundf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half frac(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(fa - truncf(fa));
-#else // __CUDA_ARCH__
-    float fa = THC_half2float(a);
-    return THC_float2half(fa - floorf(fa));
-#endif
-  }
-
-  static inline __host__ __device__ half cinv(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(1.0f / fa);
-#else // __CUDA_ARCH__
-    return THC_float2half(1.0f / THC_half2float(a));
-#endif
-  }
-
-  static inline __host__ __device__ half add(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL const half& add(const half& a, const half& b) {
     return __hadd(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa + fb );
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) + THC_half2float(b));
-#endif
   }
-
-  static inline __host__ __device__ half div(half a, half b) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa / fb );
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) / THC_half2float(b));
-#endif
-  }
-
-  static inline __host__ __device__ half mul(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half mul(const half& a, const half& b) {
     return __hmul(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa * fb );
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) * THC_half2float(b));
-#endif
   }
 
-  static inline __host__ __device__ half sub(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static  THC_DECL half sub(const half& a, const half& b) {
     return __hsub(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa - fb );
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) - THC_half2float(b));
-#endif
   }
 
-  static inline __host__ __device__ half pow(half a, half b) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half(powf(fa, fb));
-#else // __CUDA_ARCH__
-    return THC_float2half(powf(THC_half2float(a), THC_half2float(b)));
-#endif
-  }
-
-};
-#endif
-
-template <>
-struct THCNumerics<float> {
-  static inline __host__ __device__ float min() { return -FLT_MAX; }
-  static inline __host__ __device__ float max() { return FLT_MAX; }
-
-  static inline __host__ __device__ bool lt(float a, float b) { return a < b; }
-  static inline __host__ __device__ bool le(float a, float b) { return a <= b; }
-  static inline __host__ __device__ bool gt(float a, float b) { return a > b; }
-  static inline __host__ __device__ bool ge(float a, float b) { return a >= b; }
-  static inline __host__ __device__ bool eq(float a, float b) { return a == b; }
-  static inline __host__ __device__ bool ne(float a, float b) { return a != b; }
-
-  static inline __host__ __device__  float exp  (float a) { return   expf(a); }
-  static inline __host__ __device__  float log  (float a) { return   logf(a); }
-  static inline __host__ __device__  float log1p(float a) { return log1pf(a); }
-  static inline __host__ __device__  float cos  (float a) { return   cosf(a); }
-  static inline __host__ __device__  float sin  (float a) { return   sinf(a); }
-  static inline __host__ __device__  float sqrt (float a) { return  sqrtf(a); }
-  static inline __host__ __device__  float rsqrt(float a) { return rsqrtf(a); }
-  static inline __host__ __device__  float ceil (float a) { return  ceilf(a); }
-  static inline __host__ __device__  float floor(float a) { return floorf(a); }
-  static inline __host__ __device__  float trunc(float a) { return truncf(a); }
-  static inline __host__ __device__  float neg  (float a) { return        -a; }
-  static inline __host__ __device__  float acos (float a) { return  acosf(a); }
-  static inline __host__ __device__  float cosh (float a) { return  coshf(a); }
-  static inline __host__ __device__  float acosh(float a) { return acoshf(a); }
-  static inline __host__ __device__  float asin (float a) { return  asinf(a); }
-  static inline __host__ __device__  float sinh (float a) { return  sinhf(a); }
-  static inline __host__ __device__  float asinh(float a) { return asinhf(a); }
-  static inline __host__ __device__  float tan  (float a) { return   tanf(a); }
-  static inline __host__ __device__  float atan (float a) { return  atanf(a); }
-  static inline __host__ __device__  float tanh (float a) { return  tanhf(a); }
-  static inline __host__ __device__  float abs  (float a) { return   fabs(a); }
-  static inline __host__ __device__  float round(float a) { return roundf(a); }
-  static inline __host__ __device__  float frac (float a) { return a - truncf(a); }
-  static inline __host__ __device__  float cinv (float a) { return 1.0f / a; }
-  static inline __host__ __device__  float add  (float a, float b) { return a + b; }
-  static inline __host__ __device__  float div  (float a, float b) { return a / b; }
-  static inline __host__ __device__  float mul  (float a, float b) { return a * b; }
-  static inline __host__ __device__  float sub  (float a, float b) { return a - b; }
-  static inline __host__ __device__  float pow  (float a, float b) { return powf(a, b); }
-};
-
-template <>
-struct THCNumerics<double> {
-  static inline __host__ __device__ double min() { return -DBL_MAX; }
-  static inline __host__ __device__ double max() { return DBL_MAX; }
-
-  static inline __host__ __device__ bool lt(double a, double b) { return a < b; }
-  static inline __host__ __device__ bool le(double a, double b) { return a <= b; }
-  static inline __host__ __device__ bool gt(double a, double b) { return a > b; }
-  static inline __host__ __device__ bool ge(double a, double b) { return a >= b; }
-  static inline __host__ __device__ bool eq(double a, double b) { return a == b; }
-  static inline __host__ __device__ bool ne(double a, double b) { return a != b; }
-
-  static inline __host__ __device__  double exp  (double a) { return   ::exp(a); }
-  static inline __host__ __device__  double log  (double a) { return   ::log(a); }
-  static inline __host__ __device__  double log1p(double a) { return ::log1p(a); }
-  static inline __host__ __device__  double cos  (double a) { return   ::cos(a); }
-  static inline __host__ __device__  double sin  (double a) { return   ::sin(a); }
-  static inline __host__ __device__  double sqrt (double a) { return  ::sqrt(a); }
-  static inline __host__ __device__  double rsqrt(double a) { return ::rsqrt(a); }
-  static inline __host__ __device__  double ceil (double a) { return  ::ceil(a); }
-  static inline __host__ __device__  double floor(double a) { return ::floor(a); }
-  static inline __host__ __device__  double trunc(double a) { return ::trunc(a); }
-  static inline __host__ __device__  double neg  (double a) { return       -a; }
-  static inline __host__ __device__  double acos (double a) { return  ::acos(a); }
-  static inline __host__ __device__  double cosh (double a) { return  ::cosh(a); }
-  static inline __host__ __device__  double acosh(double a) { return ::acosh(a); }
-  static inline __host__ __device__  double asin (double a) { return  ::asin(a); }
-  static inline __host__ __device__  double sinh (double a) { return  ::sinh(a); }
-  static inline __host__ __device__  double asinh(double a) { return ::asinh(a); }
-  static inline __host__ __device__  double tan  (double a) { return   ::tan(a); }
-  static inline __host__ __device__  double atan (double a) { return  ::atan(a); }
-  static inline __host__ __device__  double tanh (double a) { return  ::tanh(a); }
-  static inline __host__ __device__  double abs  (double a) { return   ::abs(a); }
-  static inline __host__ __device__  double round(double a) { return ::round(a); }
-  static inline __host__ __device__  double frac (double a) { return a - ::trunc(a); }
-  static inline __host__ __device__  double cinv (double a) { return 1.0 / a; }
-  static inline __host__ __device__  double add  (double a, double b) { return a + b; }
-  static inline __host__ __device__  double div  (double a, double b) { return a / b; }
-  static inline __host__ __device__  double mul  (double a, double b) { return a * b; }
-  static inline __host__ __device__  double sub  (double a, double b) { return a - b; }
-  static inline __host__ __device__  double pow  (double a, double b) { return ::pow(a, b); }
-};
-
-/// `half` has some type conversion issues associated with it, since it
-/// is a struct without a constructor/implicit conversion constructor.
-/// We use this to convert scalar values to the given type that the
-/// tensor expects.
-template <typename In, typename Out>
-struct ScalarConvert {
-  static __host__ __device__ Out to(const In v) { return (Out) v; }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <typename Out>
-struct ScalarConvert<half, Out> {
-  static __host__ __device__ Out to(const half v) {
-#ifdef __CUDA_ARCH__
-    return (Out) __half2float(v);
-#else
-    return (Out) THC_half2float(v);
-#endif
+  static  THC_DECL half div  (const half& a, const half& b) { 
+    return  hdiv(a,b); 
   }
-};
-
-template <typename In>
-struct ScalarConvert<In, half> {
-  static __host__ __device__ half to(const In v) {
-#ifdef __CUDA_ARCH__
-    return __float2half((float) v);
-#else
-    return THC_float2half((float) v);
-#endif
+  static  THC_DECL half mod  (const half& a, const half& b) { 
+    return __float2half(fmodf(__half2float(a), __half2float(b)));
   }
+# endif
 };
 
-template <>
-struct ScalarConvert<half, half> {
-  static __host__ __device__ half to(const half v) {
-    return v;
-  }
-};
 #endif
 
-#endif // THC_NUMERICS_INC
+#endif // THC_NUMERICS_INC
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 094cf0b8..1511fb88 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -1,6 +1,5 @@
-#include "THCTensorMath.h"
 #include "THCGeneral.h"
-#include "THCHalf.h"
+#include "THCTensorMath.h"
 #include "THCTensorCopy.h"
 #include "THCApply.cuh"
 #include "THCNumerics.cuh"
@@ -8,238 +7,57 @@
 
 template <typename T>
 struct TensorAddConstantOp {
+  typedef THCNumerics<T> N_;
   TensorAddConstantOp(T v) : val(v) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in + val;
+    *out = N_::s_(N_::add(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v += val;
+        this->operator()(v, v);
   }
-
-  const T val;
+  const typename N_::storage_type val;
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorAddConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorAddConstantOp(half v) : val(v) {}
-#else
-  TensorAddConstantOp(half v) : fval(THC_half2float(v)) {}
-#endif
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin + fval;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hadd(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv += fval;
-    *v = __float2half(fv);
-#endif
-  }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
-};
-#endif // CUDA_HALF_TENSOR
-
-
 template <typename T>
 struct TensorSubConstantOp {
+  typedef THCNumerics<T> N_;
   TensorSubConstantOp(T v) : val(v) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in - val;
+    *out = N_::s_(N_::sub(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v -= val;
-  }
-
-  const T val;
-};
-
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSubConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorSubConstantOp(half v): val(THC_float2half(-(THC_half2float(v)))) {}
-#else
-  TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {}
-#endif
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin + fval;
-    *out = __float2half(fout);
-#endif
+        this->operator()(v, v);
   }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hadd(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv += fval;
-    *v = __float2half(fv);
-#endif
-  }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
+  const typename N_::storage_type val;
 };
-#endif // CUDA_HALF_TENSOR
 
 
 template <typename T>
 struct TensorMulConstantOp {
   TensorMulConstantOp(T v) : val(v) {}
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in * val;
+    *out = N_::s_(N_::mul(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v *= val;
+        this->operator()(v, v);
   }
-
-  const T val;
+  const typename N_::storage_type val;
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorMulConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorMulConstantOp(half v) : val(v) {}
-#else
-  TensorMulConstantOp(half v) : fval(THC_half2float(v)) {}
-#endif
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin * fval;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hmul(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv *= fval;
-    *v = __float2half(fv);
-#endif
-  }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
-};
-#endif // CUDA_HALF_TENSOR
-
 template <typename T>
 struct TensorDivConstantOp {
-  TensorDivConstantOp(T v) : val(v) {}
+  typedef THCNumerics<T> N_;
+  TensorDivConstantOp(const T& v) : val(N_::div(N_::Constants::one(), v)) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in / val;
+    *out = N_::s_(N_::div(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v /= val;
+        this->operator()(v, v);
   }
-
-  const T val;
+  const typename N_::storage_type val;
 };
 
-template <>
-struct TensorDivConstantOp<float> {
-  TensorDivConstantOp(float v) : val(1.f / v) {}
-  __device__ __forceinline__ void operator()(float* out, float* in) {
-    *out = *in * val;
-  }
-
-  __device__ __forceinline__ void operator()(float* v) {
-    *v *= val;
-  }
-
-  const float val;
-};
-
-template <>
-struct TensorDivConstantOp<double> {
-  TensorDivConstantOp(double v) : val(1. / v) {}
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = *in * val;
-  }
-
-  __device__ __forceinline__ void operator()(double* v) {
-    *v *= val;
-  }
-
-  const double val;
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorDivConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorDivConstantOp(half v) : val(ScalarInv<half>::to(v)) {}
-#else
-  TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {}
-#endif
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin * fval;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hmul(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv *= fval;
-    *v = __float2half(fv);
-#endif
-  }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
-};
-#endif // CUDA_HALF_TENSOR
-
 template <typename T>
 struct TensorRemainderOp {
   TensorRemainderOp(T v) : val(v) {}
@@ -293,7 +111,7 @@ struct TensorRemainderOp<half> {
 
   __device__ __forceinline__ void operator()(half* out, half* in) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*in,  __hmul(val, hfloor(__hdiv(*in,  val))));
+    *out = __hsub(*in,  __hmul(val, hfloor(hdiv(*in,  val))));
 #else
     float fin = __half2float(*in);
     float fout = fin - fval * floorf(fin / fval);
@@ -303,7 +121,7 @@ struct TensorRemainderOp<half> {
 
   __device__ __forceinline__ void operator()(half* v) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hsub(*v, __hmul(val, hfloor(__hdiv(*v, val))));
+    *v = __hsub(*v, __hmul(val, hfloor(hdiv(*v, val))));
 #else
     float fv = __half2float(*v);
     fv = fv - fval * floorf(fv / fval);
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
index de96cadd..f9499c5e 100644
--- a/lib/THC/THCTensorMathPointwise.cuh
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -1,9 +1,8 @@
 #ifndef THC_TENSORMATH_POINTWISE_CUH
 #define THC_TENSORMATH_POINTWISE_CUH
 
-#include "THCTensorMath.h"
 #include "THCGeneral.h"
-#include "THCHalf.h"
+#include "THCTensorMath.h"
 #include "THCTensorCopy.h"
 #include "THCApply.cuh"
 #include "THCNumerics.cuh"
@@ -11,313 +10,96 @@
 
 template <typename T>
 struct TensorSigmoidOp {
+  typedef THCNumerics<T> N_;
+  typedef typename N_::Constants NC_;
   __device__ __forceinline__ void operator()(T* out, T* in) const {
-    T one = (T) 1.0;
-    *out = one / (one + THCNumerics<T>::exp(- *in));
+    *out = N_::div(NC_::one(), N_::add(NC_::one(), N_::exp(N_::neg(*in))));
   }
-
   __device__ __forceinline__ void operator()(T* v) const {
-    T one = (T) 1.0;
-    *v = one / (one + THCNumerics<T>::exp(- *v));
+     this->operator()(v, v);
   }
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSigmoidOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) const {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
-#else
-    float fin = __half2float(*in);
-    *out = __float2half(1.0f / (1.0f + expf(- fin)));
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) const {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
-#else
-    float fv = __half2float(*v);
-    *v = __float2half(1.0f / (1.0f + expf(- fv)));
-#endif
-  }
-};
-#endif
-
 template <typename T>
 struct TensorSignOp {
+  typedef THCNumerics<T> N_;
+  typedef THCNumConstants<typename N_::storage_type> NC_;
+
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    T orig = *in;
-    *out = (orig > 0) - (orig < 0);
+    const T& orig = *in;
+    *out = (N_::gt(orig, NC_::zero()) ? NC_::one() :
+            N_::lt(orig, NC_::zero()) ? N_::neg(NC_::one()) :
+            NC_::zero());
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    T orig = *v;
-    *v = (orig > 0) - (orig < 0);
+     this->operator()(v, v);
   }
 };
 
-template <>
-struct TensorSignOp<unsigned char> {
-  __device__ __forceinline__ void operator()(unsigned char* out, unsigned char* in) {
-    unsigned char orig = *in;
-    *out = (orig == 0) ? 0 : 1;
-  }
-
-  __device__ __forceinline__ void operator()(unsigned char* v) {
-    unsigned char orig = *v;
-    *v = (orig == 0) ? 0 : 1;
-  }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSignOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half zero = ScalarConvert<int, half>::to(0);
-    half orig = *in;
-    *out = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero));
-#else
-    float orig = __half2float(*in);
-    *out = __float2half((orig > 0) - (orig < 0));
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half zero = ScalarConvert<int, half>::to(0);
-    half orig = *v;
-    *v = __float2half((float) __hgt(orig, zero) -  (float) __hlt(orig, zero));
-#else
-    float orig = __half2float(*v);
-    *v = __float2half((orig > 0) - (orig < 0));
-#endif
-  }
-};
-#endif
-
 template <typename T>
 struct TensorAddOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out += *in;
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 + *in2;
+    *out = N_::s_(N_::add(*in1, *in2));
   }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorAddOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*out, *in);
-#else
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout += fin;
-    *out = __float2half(fout);
-#endif
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, in, out);
   }
 
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in1, *in2);
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 + fin2;
-    *out = __float2half(fout);
-#endif
-  }
 };
-#endif // CUDA_HALF_TENSOR
+
 
 template <typename T>
 struct TensorCAddOp {
+  typedef THCNumerics<T> N_;
   TensorCAddOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out += val * *in;
+    this->operator()(out, out, in);
   }
-
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 + val * *in2;
+    *out = N_::s_(N_::add(*in1, N_::mul(val, *in2)));
   }
 
-  T val;
+ const typename N_::storage_type val;
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorCAddOp<half> {
-  TensorCAddOp(half v) : val(v) {}
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*out, __hmul(val, *in));
-#else
-    float fout = __half2float(*out);
-    float fval = __half2float(val);
-    float fin = __half2float(*in);
-
-    fout += fval * fin;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in1, __hmul(val, *in2));
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fval = __half2float(val);
-
-    float fout = fin1 + fval * fin2;
-    *out = __float2half(fout);
-#endif
-  }
-
-  half val;
-};
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorSubOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out -= *in;
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 - *in2;
+    *out = N_::s_(N_::sub(*in1, *in2));
   }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSubOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*out, *in);
-#else
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout -= fin;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*in1, *in2);
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 - fin2;
-    *out = __float2half(fout);
-#endif
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorMulOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out *= *in;
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 * *in2;
+    *out = N_::s_(N_::mul(*in1, *in2));
   }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorMulOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*out, *in);
-#else
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout *= fin;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*in1, *in2);
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 * fin2;
-    *out = __float2half(fout);
-#endif
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template<typename T>
 struct TensorPowOp {
-  TensorPowOp(T v) : val(v) {}
+  typedef THCNumerics<T> N_;
+  TensorPowOp(T v) : val(N_::s_(v)) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = powf((float) *in, (float) val);
+    *out = N_::s_(N_::pow(*in, val));
   }
 
   __device__ __forceinline__ void operator()(T* v) {
-    *v = powf((float) *v, (float) val);
+    this->operator()(v, v);
   }
-
-  const T val;
+ const typename N_::storage_type val;
 };
 
-template <>
-struct TensorPowOp<double> {
-  TensorPowOp(double v) : val(v) {}
-
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = pow(*in, val);
-  }
-
-  __device__ __forceinline__ void operator()(double* v) {
-    *v = pow(*v, val);
-  }
-
-  const double val;
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorPowOp<half> {
-  TensorPowOp(half v) : val(v) {}
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-    // No fp16 pow function yet
-    float fin = __half2float(*in);
-    float fval = __half2float(val);
-    float fout = powf(fin, fval);
-    *out = __float2half(fout);
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-    // No fp16 pow function yet
-    float fv = __half2float(*v);
-    float fval = __half2float(val);
-    float fout = powf(fv, fval);
-    *v = __float2half(fout);
-  }
-
-  const half val;
-};
-#endif // CUDA_HALF_TENSOR
-
 template<typename T>
 struct TensorTPowOp {
   TensorTPowOp(T v) : val(v) {}
@@ -335,82 +117,25 @@ struct TensorTPowOp {
 
 template <typename T>
 struct TensorCPowOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = powf((float) *out, (float) *in);
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = powf((float) *in1, (float) *in2);
+    *out = N_::s_(N_::pow(*in1,*in2));
   }
-};
-
-template <>
-struct TensorCPowOp<double> {
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = pow(*out, *in);
-  }
-
-  __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
-    *out = pow(*in1, *in2);
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorCPowOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-    // No fp16 pow function yet
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout = powf(fout, fin);
-    *out = __float2half(fout);
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-    // No fp16 pow function yet
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = powf(fin1, fin2);
-    *out = __float2half(fout);
-  }
-};
-#endif // CUDA_HALF_TENSOR
-
 template <typename T>
 struct TensorDivOp {
-  __device__ __forceinline__ void
-  operator()(T* out, T* in) {
-    *out /= *in;
-  }
-
-  __device__ __forceinline__ void
-  operator()(T* out, T* in1, T* in2) {
-    *out = *in1 / *in2;
-  }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorDivOp<half> {
-  __device__ __forceinline__ void
-  operator()(half* out, half* in) {
-    // No fp16 div instruction yet
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout /= fin;
-    *out = __float2half(fout);
+  typedef THCNumerics<T> N_;
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = N_::s_(N_::div(*in1,*in2));
   }
-
-  __device__ __forceinline__ void
-  operator()(half* out, half* in1, half* in2) {
-    // No fp16 div instruction yet
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 / fin2;
-    *out = __float2half(fout);
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorCRemainderOp {
@@ -450,7 +175,7 @@ template <>
 struct TensorCRemainderOp<half> {
   __device__ __forceinline__ void operator()(half* out, half* in) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*out, __hmul(*in, hfloor(__hdiv(*out, *in))));
+    *out = __hsub(*out, __hmul(*in, hfloor(hdiv(*out, *in))));
 #else
     float fout = __half2float(*out);
     float fin = __half2float(*in);
@@ -460,7 +185,7 @@ struct TensorCRemainderOp<half> {
 
   __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*in1, __hmul(*in2, hfloor(__hdiv(*in1, *in2))));
+    *out = __hsub(*in1, __hmul(*in2, hfloor(hdiv(*in1, *in2))));
 #else
     float fin1 = __half2float(*in1);
     float fin2 = __half2float(*in2);
@@ -472,102 +197,60 @@ struct TensorCRemainderOp<half> {
 
 template <typename T>
 struct TensorCFmodOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *out % *in;
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 % *in2;
-  }
-};
-
-template <>
-struct TensorCFmodOp<float> {
-  __device__ __forceinline__ void operator()(float* out, float* in) {
-    *out = fmodf(*out, *in);
-  }
-
-  __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
-    *out = fmodf(*in1, *in2);
-  }
-};
-
-template <>
-struct TensorCFmodOp<double> {
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = fmod(*out, *in);
-  }
-
-  __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
-    *out = fmod(*in1, *in2);
+    *out = N_::s_(N_::mod(*in1,*in2));
   }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorCFmodOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-    *out = __float2half(fmodf(__half2float(*out), __half2float(*in)));
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-    *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2)));
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorClampOp {
+  typedef THCNumerics<T> N_;
   TensorClampOp(T min, T max) : minValue(min), maxValue(max) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    T val = THCNumerics<T>::lt(*in, maxValue) ? *in : maxValue;
-    *out = THCNumerics<T>::gt(minValue, val) ? minValue : val;
+    T val = N_::lt(*in, maxValue) ? *in : maxValue;
+    *out = N_::gt(minValue, val) ? minValue : val;
   }
 
   __device__ __forceinline__ void operator()(T* v) {
-    T val = THCNumerics<T>::lt(*v, maxValue) ? *v : maxValue;
-    *v = THCNumerics<T>::gt(minValue, val) ? minValue : val;
+    T val = N_::lt(*v, maxValue) ? *v : maxValue;
+    *v = N_::gt(minValue, val) ? minValue : val;
   }
-
-  const T minValue;
-  const T maxValue;
+  const typename N_::storage_type minValue;
+  const typename N_::storage_type maxValue;
 };
 
 template <typename T>
 struct TensorLerpOp {
+  typedef THCNumerics<T> N_;
   TensorLerpOp(T w) : w(w) {}
-
   __device__ __forceinline__ void operator()(T *out, T *a, T *b) {
-    *out = THCNumerics<T>::add(
-      *a,
-      THCNumerics<T>::mul(
-          w,
-          THCNumerics<T>::sub(*b, *a)
-        )
-    );
+    *out = N_::add(*a, N_::mul(w, N_::sub(*b, *a)));
   }
-
-  const T w;
+  T w;
 };
 
 template <typename T>
 struct TensorCrossOp {
+  typedef THCNumerics<T> N_;
   TensorCrossOp(long sx, long sy, long so) : sx(sx), sy(sy), so(so) {}
-
   __device__ __forceinline__ void operator()(T* out, T* x, T*y) {
-    out[0 * so] = THCNumerics<T>::sub(
-        THCNumerics<T>::mul(x[1 * sx], y[2 * sy]),
-        THCNumerics<T>::mul(x[2 * sx], y[1 * sy])
+    out[0 * so] = N_::sub(
+        N_::mul(x[1 * sx], y[2 * sy]),
+        N_::mul(x[2 * sx], y[1 * sy])
     );
 
-    out[1 * so] = THCNumerics<T>::sub(
-        THCNumerics<T>::mul(x[2 * sx], y[0 * sy]),
-        THCNumerics<T>::mul(x[0 * sx], y[2 * sy])
+    out[1 * so] = N_::sub(
+        N_::mul(x[2 * sx], y[0 * sy]),
+        N_::mul(x[0 * sx], y[2 * sy])
     );
 
-    out[2 * so] = THCNumerics<T>::sub(
-        THCNumerics<T>::mul(x[0 * sx], y[1 * sy]),
-        THCNumerics<T>::mul(x[1 * sx], y[0 * sy])
+    out[2 * so] = N_::sub(
+        N_::mul(x[0 * sx], y[1 * sy]),
+        N_::mul(x[1 * sx], y[0 * sy])
     );
   }
 
@@ -576,36 +259,39 @@ struct TensorCrossOp {
 
 template <typename T>
 struct TensorMaxOp {
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::gt(*out, *in) ? *out : *in;
+    *out = N_::gt(*out, *in) ? *out : *in;
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::gt(*in1, *in2) ? *in1 : *in2;
+    *out = N_::gt(*in1, *in2) ? *in1 : *in2;
   }
 };
 
 template <typename T>
 struct TensorMinOp {
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::lt(*out, *in) ? *out : *in;
+    *out = N_::lt(*out, *in) ? *out : *in;
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::lt(*in1, *in2) ? *in1 : *in2;
+    *out = N_::lt(*in1, *in2) ? *in1 : *in2;
   }
 };
 
 template <typename T>
 struct TensorMaxValueOp {
+  typedef THCNumerics<T> N_;
   TensorMaxValueOp(T v) : val(v) {}
 
   __device__ __forceinline__ void operator()(T* out) {
-    *out = THCNumerics<T>::gt(*out, val) ? *out : val;
+    *out = N_::gt(*out, val) ? *out : val;
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::gt(*in, val) ? *in : val;
+    *out = N_::gt(*in, val) ? *in : val;
   }
 
   T val;
@@ -613,51 +299,39 @@ struct TensorMaxValueOp {
 
 template <typename T>
 struct TensorMinValueOp {
+  typedef THCNumerics<T> N_;
   TensorMinValueOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out) {
-    *out = THCNumerics<T>::lt(*out, val) ? *out : val;
+    *out = N_::lt(*out, val) ? *out : val;
   }
-
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::lt(*in, val) ? *in : val;
+    *out = N_::lt(*in, val) ? *in : val;
   }
-
   T val;
 };
 
 template <typename T>
 struct TensorAddCMulOp {
+  typedef THCNumerics<T> N_;
   TensorAddCMulOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::add(
-      *out,
-      THCNumerics<T>::mul(
-        val,
-        THCNumerics<T>::mul(*in1, *in2)
-      )
-    );
+    *out = N_::add(*out,N_::mul(val,N_::mul(*in1, *in2)));
   }
-
   T val;
 };
 
 template <typename T>
 struct TensorAddCDivOp {
+  typedef THCNumerics<T> N_;
   TensorAddCDivOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::add(
-      *out,
-      THCNumerics<T>::mul(
-        val,
-        THCNumerics<T>::div(*in1, *in2)
-      )
-    );
+    *out = N_::add( *out,
+                    N_::mul(val,
+                            N_::div(*in1, *in2)
+                            )
+                    );
   }
-
-  T val;
+  typename N_::storage_type val;
 };
 
 #endif // THC_TENSORMATH_POINTWISE_CUH
diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu
index 0b4094b8..b7e82021 100644
--- a/lib/THC/generic/THCTensorMathPairwise.cu
+++ b/lib/THC/generic/THCTensorMathPairwise.cu
@@ -63,7 +63,7 @@ THC_API void
 THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
 {
   THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
-  THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero");
+  THArgCheck(value != THCNumConstants<real>::zero(), 3, "divide by zero");
 
   if (self_ == src_) {
     if (!THC_pointwiseApply1(state, self_, TensorDivConstantOp<real>(value))) {