Skip to content
This repository has been archived by the owner on Aug 3, 2021. It is now read-only.

Commit

Permalink
Merge pull request torch#533 from BTNC/replace-long
Browse files Browse the repository at this point in the history
replace long with ptrdiff_t for memory size/offset, element count
soumith authored Oct 8, 2016

Verified

This commit was signed with the committer’s verified signature.
2 parents 820becf + c1c5e58 commit 9efd392
Showing 37 changed files with 271 additions and 212 deletions.
4 changes: 2 additions & 2 deletions FFI.lua
Original file line number Diff line number Diff line change
@@ -50,7 +50,7 @@ cudaStream_t THCState_getCurrentStream(THCState *state);
typedef struct THCStorage
{
real *data;
long size;
ptrdiff_t size;
int refcount;
char flag;
THAllocator *allocator;
@@ -65,7 +65,7 @@ typedef struct THCTensor
int nDimension;
THCStorage *storage;
long storageOffset;
ptrdiff_t storageOffset;
int refcount;
char flag;
54 changes: 53 additions & 1 deletion TensorMath.lua
Original file line number Diff line number Diff line change
@@ -2,6 +2,58 @@ local wrap = require 'cwrap'

local interface = wrap.CInterface.new()
local method = wrap.CInterface.new()
local argtypes = wrap.CInterface.argtypes

-- Argument adapter teaching cwrap how to marshal ptrdiff_t values between
-- Lua and generated C code (memory sizes, offsets, element counts).
-- Each entry returns a C code fragment (or nil when nothing is emitted).
argtypes['ptrdiff_t'] = {

   -- Type name shown in generated usage/help messages.
   helpname = function(arg)
      return 'ptrdiff_t'
   end,

   -- Emit the C variable declaration; purely numeric defaults are baked
   -- into the initializer here, everything else starts at 0.
   declare = function(arg)
      local initial = tonumber(tostring(arg.default))
      if initial == nil then
         initial = 0
      end
      return string.format("%s arg%d = %g;", 'ptrdiff_t', arg.i, initial)
   end,

   -- Stack-slot type test used by the generated argument dispatcher.
   check = function(arg, idx)
      return string.format("lua_isinteger(L, %d)", idx)
   end,

   -- Pull the integer off the Lua stack and cast it to ptrdiff_t.
   read = function(arg, idx)
      return string.format("arg%d = (%s)lua_tointeger(L, %d);", arg.i, 'ptrdiff_t', idx)
   end,

   -- Non-numeric defaults (arbitrary C expressions) could not be folded
   -- into the declaration, so they are assigned at runtime instead.
   init = function(arg)
      if not arg.default then
         return
      end
      local expr = tostring(arg.default)
      if tonumber(expr) then
         return  -- numeric default already handled by declare()
      end
      return string.format("arg%d = %s;", arg.i, expr)
   end,

   -- Spelling of the argument when passed to the wrapped C function.
   carg = function(arg)
      return string.format('arg%d', arg.i)
   end,

   -- Spelling of the argument when it receives the C return value.
   creturn = function(arg)
      return string.format('arg%d', arg.i)
   end,

   -- Before returning to Lua, push arguments flagged as 'returned'.
   precall = function(arg)
      if arg.returned then
         return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
      end
   end,

   -- After the C call, push the value flagged as the C return ('creturned').
   postcall = function(arg)
      if arg.creturned then
         return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
      end
   end
}

interface:print('/* WARNING: autogenerated file */')
interface:print('')
@@ -559,7 +611,7 @@ for k, Tensor_ in pairs(handledTypenames) do
wrap("numel",
cname("numel"),
{{name=Tensor},
{name="long", creturned=true}})
{name="ptrdiff_t", creturned=true}})

wrap("add",
cname("add"),
14 changes: 7 additions & 7 deletions generic/CTensor.c
Original file line number Diff line number Diff line change
@@ -173,16 +173,16 @@ void THFloatTensor_kernel_copy(float *dst,
long *dst_sz, long *dst_st, int dst_dim,
float *src,
long *src_sz, long *src_st, int src_dim,
long n_elem)
ptrdiff_t n_elem)
{
long k;
ptrdiff_t k;

for(k = 0; k < n_elem; k++)
{
long src_idx = 0;
long src_rest = k;
long dst_idx = 0;
long dst_rest = k;
ptrdiff_t src_idx = 0;
ptrdiff_t src_rest = k;
ptrdiff_t dst_idx = 0;
ptrdiff_t dst_rest = k;
int dim;

for(dim = 0; dim < dst_dim; dim++)
@@ -206,7 +206,7 @@ static int cuda_FloatTensor_fakecopy(lua_State *L)
THFloatTensor *self = luaT_checkudata(L, 1, "torch.FloatTensor");
THFloatTensor *src = luaT_checkudata(L, 2, "torch.FloatTensor");
long *d_self_sz, *d_self_st, *d_src_sz, *d_src_st;
long nElement = THFloatTensor_nElement(self);
ptrdiff_t nElement = THFloatTensor_nElement(self);

THArgCheck(THFloatTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match");

4 changes: 2 additions & 2 deletions lib/THC/THCAllocator.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "THCAllocator.h"

static void *THCudaHostAllocator_alloc(void* ctx, long size) {
static void *THCudaHostAllocator_alloc(void* ctx, ptrdiff_t size) {
void* ptr;

if (size < 0) THError("Invalid memory size: %ld", size);
@@ -18,7 +18,7 @@ static void THCudaHostAllocator_free(void* ctx, void* ptr) {
THCudaCheck(cudaFreeHost(ptr));
}

static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, long size) {
static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) {
if (size < 0) THError("Invalid memory size: %ld", size);

THCudaHostAllocator_free(ctx, ptr);
10 changes: 5 additions & 5 deletions lib/THC/THCApply.cuh
Original file line number Diff line number Diff line change
@@ -101,7 +101,7 @@ inline dim3 getApplyBlock() {
return dim3(THC_APPLY_THREADS_PER_BLOCK);
}

inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) {
inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
int curDevice = -1;
cudaGetDevice(&curDevice);

@@ -116,7 +116,7 @@ inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) {
// 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
// which seems to be a good sweetspot for latency hiding
grid = dim3(min((long long) THCCeilDiv(totalElements,
(long) THC_APPLY_THREADS_PER_BLOCK),
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
4LL * numSM));
return true;
}
@@ -139,7 +139,7 @@ bool THC_pointwiseApply1(THCState* state,
const dim3 block = getApplyBlock();

dim3 grid;
long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);

if (!getApplyGrid(state, totalElements, grid)) {
return false;
@@ -253,7 +253,7 @@ bool THC_pointwiseApply2(THCState* state,
const Op& op,
TensorArgType aType = ReadWrite,
TensorArgType bType = ReadOnly) {
long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);

if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) {
return false;
@@ -431,7 +431,7 @@ bool THC_pointwiseApply3(THCState* state,
TensorArgType aType = ReadWrite,
TensorArgType bType = ReadOnly,
TensorArgType cType = ReadOnly) {
long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);

if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) ||
totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) {
4 changes: 2 additions & 2 deletions lib/THC/THCDeviceTensor-inl.cuh
Original file line number Diff line number Diff line change
@@ -103,9 +103,9 @@ THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {

template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ long
__host__ __device__ ptrdiff_t
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
long size = getSize(0);
ptrdiff_t size = getSize(0);

for (int i = 1; i < Dim; ++i) {
size *= getSize(i);
2 changes: 1 addition & 1 deletion lib/THC/THCDeviceTensor.cuh
Original file line number Diff line number Diff line change
@@ -142,7 +142,7 @@ class THCDeviceTensor {

/// Returns the total number of elements contained within our data
/// (product of `getSize(i)`)
__host__ __device__ long numElements() const;
__host__ __device__ ptrdiff_t numElements() const;

/// Returns the size array.
__host__ __device__ __forceinline__ const IndexT* sizes() const {
2 changes: 1 addition & 1 deletion lib/THC/THCDeviceTensorUtils-inl.cuh
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@ toDeviceTensor(THCState* state, THCudaTensor* t) {

// Determine the maximum offset into the tensor achievable; `IndexT`
// must be smaller than this type in order to use it.
long maxOffset = 0;
ptrdiff_t maxOffset = 0;
IndexT sizes[Dim];
IndexT strides[Dim];

25 changes: 13 additions & 12 deletions lib/THC/THCGeneral.c
Original file line number Diff line number Diff line change
@@ -60,8 +60,8 @@ struct THCState {

void (*cutorchGCFunction)(void *data);
void *cutorchGCData;
long heapSoftmax;
long heapDelta;
ptrdiff_t heapSoftmax;
ptrdiff_t heapDelta;
};

THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
@@ -640,8 +640,9 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
}
}

static long heapSize = 0; // not thread-local
static const long heapMaxDelta = 1e6;
static ptrdiff_t heapSize = 0; // not thread-local
static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%

@@ -691,8 +692,8 @@ cudaError_t THCudaFree(THCState *state, void *ptr)
return allocator->free(allocator->state, ptr);
}

static long applyHeapDelta(THCState *state) {
long newHeapSize = THAtomicAddLong(&heapSize, state->heapDelta) + state->heapDelta;
/* Fold this state's locally-batched heap delta into the global heapSize
 * counter and return the resulting total heap size.
 * NOTE(review): THAtomicAddPtrdiff is assumed to be a fetch-and-add that
 * returns the value *before* the addition, hence state->heapDelta is added
 * once more to obtain the post-update size -- confirm against THAtomic. */
static ptrdiff_t applyHeapDelta(THCState *state) {
ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta;
/* The batched delta has been published globally; reset the local batch. */
state->heapDelta = 0;
return newHeapSize;
}
@@ -701,27 +702,27 @@ static long applyHeapDelta(THCState *state) {
// When THC heap size goes above this softmax, the GC hook is triggered.
// If heap size is above 80% of the softmax after GC, then the softmax is
// increased.
static void maybeTriggerGC(THCState *state, long curHeapSize) {
static void maybeTriggerGC(THCState *state, ptrdiff_t curHeapSize) {
if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) {
(state->cutorchGCFunction)(state->cutorchGCData);

// ensure heapSize is accurate before updating heapSoftmax
long newHeapSize = applyHeapDelta(state);
ptrdiff_t newHeapSize = applyHeapDelta(state);

if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) {
state->heapSoftmax = state->heapSoftmax * heapSoftmaxGrowthFactor;
state->heapSoftmax = (ptrdiff_t)state->heapSoftmax * heapSoftmaxGrowthFactor;
}
}
}

void THCHeapUpdate(THCState *state, long size) {
void THCHeapUpdate(THCState *state, ptrdiff_t size) {
state->heapDelta += size;
// batch updates to global heapSize to minimize thread contention
if (labs(state->heapDelta) < heapMaxDelta) {
if (state->heapDelta < heapMaxDelta && state->heapDelta > heapMinDelta) {
return;
}

long newHeapSize = applyHeapDelta(state);
ptrdiff_t newHeapSize = applyHeapDelta(state);
if (size > 0) {
maybeTriggerGC(state, newHeapSize);
}
2 changes: 1 addition & 1 deletion lib/THC/THCGeneral.h.in
Original file line number Diff line number Diff line change
@@ -114,6 +114,6 @@ THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
THC_API void THCSetGCHandler(THCState *state,
void (*torchGCHandlerFunction)(void *data),
void *data );
THC_API void THCHeapUpdate(THCState *state, long size);
THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size);

#endif
4 changes: 2 additions & 2 deletions lib/THC/THCHalf.cu
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@ struct __float2halfOp {
__device__ half operator()(float v) { return __float2half(v); }
};

void THCFloat2Half(THCState *state, half *out, float *in, long len) {
void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) {
thrust::transform(
#if CUDA_VERSION >= 7000
thrust::cuda::par.on(THCState_getCurrentStream(state)),
@@ -20,7 +20,7 @@ void THCFloat2Half(THCState *state, half *out, float *in, long len) {
in, in + len, out, __float2halfOp());
}

void THCHalf2Float(THCState *state, float *out, half *in, long len) {
void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
thrust::transform(
#if CUDA_VERSION >= 7000
thrust::cuda::par.on(THCState_getCurrentStream(state)),
4 changes: 2 additions & 2 deletions lib/THC/THCHalf.h
Original file line number Diff line number Diff line change
@@ -18,8 +18,8 @@
#include <cuda_fp16.h>
#include <stdint.h>

THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len);
THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
THC_API half THC_float2half(float a);
THC_API float THC_half2float(half a);

12 changes: 6 additions & 6 deletions lib/THC/THCReduce.cuh
Original file line number Diff line number Diff line change
@@ -123,7 +123,7 @@ inline dim3 getNoncontigReduceBlock() {
return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE);
}

inline dim3 getContigReduceBlock(long numSlices, long reductionSize) {
inline dim3 getContigReduceBlock(ptrdiff_t numSlices, long reductionSize) {
// If the number of slices is low but the reduction dimension size
// is high, then we should increase block size for greater parallelism.
// Aim for at least 32 warps per SM (assume 15 SMs; don't bother
@@ -148,13 +148,13 @@ inline dim3 getContigReduceBlock(long numSlices, long reductionSize) {
return dim3(numWarps * 32);
}

inline bool getNoncontigReduceGrid(long elements, dim3& grid) {
inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) {
// One output point per thread
return THC_getGridFromTiles(THCCeilDiv(elements,
(long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
(ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
}

inline bool getContigReduceGrid(long elements, dim3& grid) {
// Compute the launch grid for a contiguous reduction, writing it into
// `grid`. Returns whatever THC_getGridFromTiles reports (presumably false
// when `elements` exceeds the maximum representable grid -- confirm at its
// definition).
inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) {
// One output point per block
return THC_getGridFromTiles(elements, grid);
}
@@ -169,11 +169,11 @@ bool THC_reduceDim(THCState* state,
const ReduceOp& reduceOp,
typename TensorUtils<TensorType>::DataType init,
int dim) {
long inElements = TensorUtils<TensorType>::getNumElements(state, in);
ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);

long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim);
long outElements = inElements / reductionSize;
ptrdiff_t outElements = inElements / reductionSize;

if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS ||
TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
Loading

0 comments on commit 9efd392

Please sign in to comment.