Skip to content
This repository has been archived by the owner on Aug 3, 2021. It is now read-only.

Commit

Permalink
Merge pull request torch#533 from BTNC/replace-long
Browse files Browse the repository at this point in the history
replace long with ptrdiff_t for memory size/offset, element count
soumith authored Oct 8, 2016

Verified

This commit was signed with the committer’s verified signature.
2 parents 820becf + c1c5e58 commit 9efd392
Showing 37 changed files with 271 additions and 212 deletions.
4 changes: 2 additions & 2 deletions FFI.lua
Original file line number Diff line number Diff line change
@@ -50,7 +50,7 @@ cudaStream_t THCState_getCurrentStream(THCState *state);
typedef struct THCStorage
{
real *data;
long size;
ptrdiff_t size;
int refcount;
char flag;
THAllocator *allocator;
@@ -65,7 +65,7 @@ typedef struct THCTensor
int nDimension;
THCStorage *storage;
long storageOffset;
ptrdiff_t storageOffset;
int refcount;
char flag;
54 changes: 53 additions & 1 deletion TensorMath.lua
Original file line number Diff line number Diff line change
@@ -2,6 +2,58 @@ local wrap = require 'cwrap'

local interface = wrap.CInterface.new()
local method = wrap.CInterface.new()
local argtypes = wrap.CInterface.argtypes

-- Argument adapter teaching cwrap how to marshal ptrdiff_t values between
-- Lua and generated C code (memory sizes, offsets, element counts).
-- Each entry returns a C code fragment (or nil when nothing is emitted).
argtypes['ptrdiff_t'] = {

   -- Type name shown in generated usage/help messages.
   helpname = function(arg)
      return 'ptrdiff_t'
   end,

   -- Emit the C variable declaration; purely numeric defaults are baked
   -- into the initializer here, everything else starts at 0.
   declare = function(arg)
      local initial = tonumber(tostring(arg.default))
      if initial == nil then
         initial = 0
      end
      return string.format("%s arg%d = %g;", 'ptrdiff_t', arg.i, initial)
   end,

   -- Stack-slot type test used by the generated argument dispatcher.
   check = function(arg, idx)
      return string.format("lua_isinteger(L, %d)", idx)
   end,

   -- Pull the integer off the Lua stack and cast it to ptrdiff_t.
   read = function(arg, idx)
      return string.format("arg%d = (%s)lua_tointeger(L, %d);", arg.i, 'ptrdiff_t', idx)
   end,

   -- Non-numeric defaults (arbitrary C expressions) could not be folded
   -- into the declaration, so they are assigned at runtime instead.
   init = function(arg)
      if not arg.default then
         return
      end
      local expr = tostring(arg.default)
      if tonumber(expr) then
         return  -- numeric default already handled by declare()
      end
      return string.format("arg%d = %s;", arg.i, expr)
   end,

   -- Spelling of the argument when passed to the wrapped C function.
   carg = function(arg)
      return string.format('arg%d', arg.i)
   end,

   -- Spelling of the argument when it receives the C return value.
   creturn = function(arg)
      return string.format('arg%d', arg.i)
   end,

   -- Before returning to Lua, push arguments flagged as 'returned'.
   precall = function(arg)
      if arg.returned then
         return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
      end
   end,

   -- After the C call, push the value flagged as the C return ('creturned').
   postcall = function(arg)
      if arg.creturned then
         return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
      end
   end
}

interface:print('/* WARNING: autogenerated file */')
interface:print('')
@@ -559,7 +611,7 @@ for k, Tensor_ in pairs(handledTypenames) do
wrap("numel",
cname("numel"),
{{name=Tensor},
{name="long", creturned=true}})
{name="ptrdiff_t", creturned=true}})

wrap("add",
cname("add"),
14 changes: 7 additions & 7 deletions generic/CTensor.c
Original file line number Diff line number Diff line change
@@ -173,16 +173,16 @@ void THFloatTensor_kernel_copy(float *dst,
long *dst_sz, long *dst_st, int dst_dim,
float *src,
long *src_sz, long *src_st, int src_dim,
long n_elem)
ptrdiff_t n_elem)
{
long k;
ptrdiff_t k;

for(k = 0; k < n_elem; k++)
{
long src_idx = 0;
long src_rest = k;
long dst_idx = 0;
long dst_rest = k;
ptrdiff_t src_idx = 0;
ptrdiff_t src_rest = k;
ptrdiff_t dst_idx = 0;
ptrdiff_t dst_rest = k;
int dim;

for(dim = 0; dim < dst_dim; dim++)
@@ -206,7 +206,7 @@ static int cuda_FloatTensor_fakecopy(lua_State *L)
THFloatTensor *self = luaT_checkudata(L, 1, "torch.FloatTensor");
THFloatTensor *src = luaT_checkudata(L, 2, "torch.FloatTensor");
long *d_self_sz, *d_self_st, *d_src_sz, *d_src_st;
long nElement = THFloatTensor_nElement(self);
ptrdiff_t nElement = THFloatTensor_nElement(self);

THArgCheck(THFloatTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match");

4 changes: 2 additions & 2 deletions lib/THC/THCAllocator.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "THCAllocator.h"

static void *THCudaHostAllocator_alloc(void* ctx, long size) {
static void *THCudaHostAllocator_alloc(void* ctx, ptrdiff_t size) {
void* ptr;

if (size < 0) THError("Invalid memory size: %ld", size);
@@ -18,7 +18,7 @@ static void THCudaHostAllocator_free(void* ctx, void* ptr) {
THCudaCheck(cudaFreeHost(ptr));
}

static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, long size) {
static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) {
if (size < 0) THError("Invalid memory size: %ld", size);

THCudaHostAllocator_free(ctx, ptr);
10 changes: 5 additions & 5 deletions lib/THC/THCApply.cuh
Original file line number Diff line number Diff line change
@@ -101,7 +101,7 @@ inline dim3 getApplyBlock() {
return dim3(THC_APPLY_THREADS_PER_BLOCK);
}

inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) {
inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
int curDevice = -1;
cudaGetDevice(&curDevice);

@@ -116,7 +116,7 @@ inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) {
// 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
// which seems to be a good sweetspot for latency hiding
grid = dim3(min((long long) THCCeilDiv(totalElements,
(long) THC_APPLY_THREADS_PER_BLOCK),
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
4LL * numSM));
return true;
}
@@ -139,7 +139,7 @@ bool THC_pointwiseApply1(THCState* state,
const dim3 block = getApplyBlock();

dim3 grid;
long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);

if (!getApplyGrid(state, totalElements, grid)) {
return false;
@@ -253,7 +253,7 @@ bool THC_pointwiseApply2(THCState* state,
const Op& op,
TensorArgType aType = ReadWrite,
TensorArgType bType = ReadOnly) {
long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);

if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) {
return false;
@@ -431,7 +431,7 @@ bool THC_pointwiseApply3(THCState* state,
TensorArgType aType = ReadWrite,
TensorArgType bType = ReadOnly,
TensorArgType cType = ReadOnly) {
long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);

if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) ||
totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) {
4 changes: 2 additions & 2 deletions lib/THC/THCDeviceTensor-inl.cuh
Original file line number Diff line number Diff line change
@@ -103,9 +103,9 @@ THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {

template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ long
__host__ __device__ ptrdiff_t
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
long size = getSize(0);
ptrdiff_t size = getSize(0);

for (int i = 1; i < Dim; ++i) {
size *= getSize(i);
2 changes: 1 addition & 1 deletion lib/THC/THCDeviceTensor.cuh
Original file line number Diff line number Diff line change
@@ -142,7 +142,7 @@ class THCDeviceTensor {

/// Returns the total number of elements contained within our data
/// (product of `getSize(i)`)
__host__ __device__ long numElements() const;
__host__ __device__ ptrdiff_t numElements() const;

/// Returns the size array.
__host__ __device__ __forceinline__ const IndexT* sizes() const {
2 changes: 1 addition & 1 deletion lib/THC/THCDeviceTensorUtils-inl.cuh
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@ toDeviceTensor(THCState* state, THCudaTensor* t) {

// Determine the maximum offset into the tensor achievable; `IndexT`
// must be smaller than this type in order to use it.
long maxOffset = 0;
ptrdiff_t maxOffset = 0;
IndexT sizes[Dim];
IndexT strides[Dim];

25 changes: 13 additions & 12 deletions lib/THC/THCGeneral.c
Original file line number Diff line number Diff line change
@@ -60,8 +60,8 @@ struct THCState {

void (*cutorchGCFunction)(void *data);
void *cutorchGCData;
long heapSoftmax;
long heapDelta;
ptrdiff_t heapSoftmax;
ptrdiff_t heapDelta;
};

THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
@@ -640,8 +640,9 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
}
}

static long heapSize = 0; // not thread-local
static const long heapMaxDelta = 1e6;
static ptrdiff_t heapSize = 0; // not thread-local
static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%

@@ -691,8 +692,8 @@ cudaError_t THCudaFree(THCState *state, void *ptr)
return allocator->free(allocator->state, ptr);
}

static long applyHeapDelta(THCState *state) {
long newHeapSize = THAtomicAddLong(&heapSize, state->heapDelta) + state->heapDelta;
/* Fold this state's locally-batched heap delta into the global heapSize
 * counter and return the resulting total heap size.
 * NOTE(review): THAtomicAddPtrdiff is assumed to be a fetch-and-add that
 * returns the value *before* the addition, hence state->heapDelta is added
 * once more to obtain the post-update size -- confirm against THAtomic. */
static ptrdiff_t applyHeapDelta(THCState *state) {
ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta;
/* The batched delta has been published globally; reset the local batch. */
state->heapDelta = 0;
return newHeapSize;
}
@@ -701,27 +702,27 @@ static long applyHeapDelta(THCState *state) {
// When THC heap size goes above this softmax, the GC hook is triggered.
// If heap size is above 80% of the softmax after GC, then the softmax is
// increased.
static void maybeTriggerGC(THCState *state, long curHeapSize) {
static void maybeTriggerGC(THCState *state, ptrdiff_t curHeapSize) {
if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) {
(state->cutorchGCFunction)(state->cutorchGCData);

// ensure heapSize is accurate before updating heapSoftmax
long newHeapSize = applyHeapDelta(state);
ptrdiff_t newHeapSize = applyHeapDelta(state);

if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) {
state->heapSoftmax = state->heapSoftmax * heapSoftmaxGrowthFactor;
state->heapSoftmax = (ptrdiff_t)state->heapSoftmax * heapSoftmaxGrowthFactor;
}
}
}

void THCHeapUpdate(THCState *state, long size) {
void THCHeapUpdate(THCState *state, ptrdiff_t size) {
state->heapDelta += size;
// batch updates to global heapSize to minimize thread contention
if (labs(state->heapDelta) < heapMaxDelta) {
if (state->heapDelta < heapMaxDelta && state->heapDelta > heapMinDelta) {
return;
}

long newHeapSize = applyHeapDelta(state);
ptrdiff_t newHeapSize = applyHeapDelta(state);
if (size > 0) {
maybeTriggerGC(state, newHeapSize);
}
2 changes: 1 addition & 1 deletion lib/THC/THCGeneral.h.in
Original file line number Diff line number Diff line change
@@ -114,6 +114,6 @@ THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
THC_API void THCSetGCHandler(THCState *state,
void (*torchGCHandlerFunction)(void *data),
void *data );
THC_API void THCHeapUpdate(THCState *state, long size);
THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size);

#endif
4 changes: 2 additions & 2 deletions lib/THC/THCHalf.cu
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@ struct __float2halfOp {
__device__ half operator()(float v) { return __float2half(v); }
};

void THCFloat2Half(THCState *state, half *out, float *in, long len) {
void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) {
thrust::transform(
#if CUDA_VERSION >= 7000
thrust::cuda::par.on(THCState_getCurrentStream(state)),
@@ -20,7 +20,7 @@ void THCFloat2Half(THCState *state, half *out, float *in, long len) {
in, in + len, out, __float2halfOp());
}

void THCHalf2Float(THCState *state, float *out, half *in, long len) {
void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
thrust::transform(
#if CUDA_VERSION >= 7000
thrust::cuda::par.on(THCState_getCurrentStream(state)),
4 changes: 2 additions & 2 deletions lib/THC/THCHalf.h
Original file line number Diff line number Diff line change
@@ -18,8 +18,8 @@
#include <cuda_fp16.h>
#include <stdint.h>

THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len);
THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
THC_API half THC_float2half(float a);
THC_API float THC_half2float(half a);

12 changes: 6 additions & 6 deletions lib/THC/THCReduce.cuh
Original file line number Diff line number Diff line change
@@ -123,7 +123,7 @@ inline dim3 getNoncontigReduceBlock() {
return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE);
}

inline dim3 getContigReduceBlock(long numSlices, long reductionSize) {
inline dim3 getContigReduceBlock(ptrdiff_t numSlices, long reductionSize) {
// If the number of slices is low but the reduction dimension size
// is high, then we should increase block size for greater parallelism.
// Aim for at least 32 warps per SM (assume 15 SMs; don't bother
@@ -148,13 +148,13 @@ inline dim3 getContigReduceBlock(long numSlices, long reductionSize) {
return dim3(numWarps * 32);
}

inline bool getNoncontigReduceGrid(long elements, dim3& grid) {
inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) {
// One output point per thread
return THC_getGridFromTiles(THCCeilDiv(elements,
(long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
(ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
}

inline bool getContigReduceGrid(long elements, dim3& grid) {
// Compute the launch grid for a contiguous reduction, writing it into
// `grid`. Returns whatever THC_getGridFromTiles reports (presumably false
// when `elements` exceeds the maximum representable grid -- confirm at its
// definition).
inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) {
// One output point per block
return THC_getGridFromTiles(elements, grid);
}
@@ -169,11 +169,11 @@ bool THC_reduceDim(THCState* state,
const ReduceOp& reduceOp,
typename TensorUtils<TensorType>::DataType init,
int dim) {
long inElements = TensorUtils<TensorType>::getNumElements(state, in);
ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);

long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim);
long outElements = inElements / reductionSize;
ptrdiff_t outElements = inElements / reductionSize;

if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS ||
TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
Loading

0 comments on commit 9efd392

Please sign in to comment.