This repository has been archived by the owner on Aug 3, 2021. It is now read-only.

Commit

Merge remote-tracking branch 'upstream/master' into spec_refactor
Conflicts:
	CMakeLists.txt
	Tensor.lua
	generic/CStorage.c
	generic/CTensor.c
	init.c
	lib/THC/THCCachingAllocator.cpp
	lib/THC/THCGeneral.c
	lib/THC/THCGenerateHalfType.h
	lib/THC/THCHalf.cu
	lib/THC/THCStorageCopy.c
	lib/THC/THCStorageCopy.cu
	lib/THC/THCTensorCopy.c
	lib/THC/generic/THCStorageCopy.c
	lib/THC/generic/THCStorageCopy.cu
	lib/THC/generic/THCStorageCopy.h
	lib/THC/generic/THCTensorCopy.h
	test/test_shutdown.lua
borisfom committed Jan 14, 2017
2 parents 03a77af + 16fb59f commit 5b7ed04
Showing 65 changed files with 2,097 additions and 803 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -11,8 +11,8 @@ IF (NOT WIN32)
SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}")
ENDIF (NOT WIN32)
IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
- SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}")
- SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_CXX_FLAGS}")
+ ADD_DEFINITIONS(-DTH_GENERIC_USE_HALF=1)
+ ADD_DEFINITIONS(-DCUDA_HAS_FP16=1)
ENDIF()

INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
4 changes: 2 additions & 2 deletions README.md
@@ -30,9 +30,9 @@ Most other (besides float) CPU torch tensor types now have a cutorch equivalent,
### CUDA memory allocation
Set the environment variable `THC_CACHING_ALLOCATOR=1` to enable the caching CUDA memory allocator.

- By default, cutorch calls `cudaMalloc` and `cudaFree` when CUDA tensors are allocated and freed. This is expensive because `cudaFree` synchronizes the CPU with the GPU. Setting `THC_CACHING_ALLOCATOR=1` will cause cutorch to cache and re-use CUDA allocations to avoid synchronizations.
+ By default, cutorch calls `cudaMalloc` and `cudaFree` when CUDA tensors are allocated and freed. This is expensive because `cudaFree` synchronizes the CPU with the GPU. Setting `THC_CACHING_ALLOCATOR=1` will cause cutorch to cache and re-use CUDA device and pinned memory allocations to avoid synchronizations.

- With the caching memory allocator, allocations and frees should logically be considered "usages" of the memory segment associated with streams, just like kernel launches. The programmer must insert the proper synchronization if memory segments are used from multiple streams.
+ With the caching memory allocator, device allocations and frees should logically be considered "usages" of the memory segment associated with streams, just like kernel launches. The programmer must insert the proper synchronization if memory segments are used from multiple streams.

###`cutorch.*` API
- `cutorch.synchronize()` : All of the CUDA API is asynchronous (barring a few functions), which means that you can queue up operations. To wait for the operations to finish, you can issue `cutorch.synchronize()` in your code, when the code waits for all GPU operations on the current GPU to finish. WARNING: synchronizes the CPU host with respect to the current device (as per `cutorch.getDevice()`) only.
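
As a rough example of the `cutorch.synchronize()` behaviour described above (sizes and operations are arbitrary):

```lua
require 'cutorch'

local x = torch.CudaTensor(1000000):fill(1)
local y = x * 2           -- kernels are queued asynchronously on the current device
cutorch.synchronize()     -- blocks the host until all queued work on the current GPU has finished
print(y:sum())
```
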
7 changes: 7 additions & 0 deletions Storage.c
@@ -6,7 +6,14 @@
#define torch_Storage_(NAME) TH_CONCAT_4(torch_,CReal,Storage_,NAME)
#define torch_Storage TH_CONCAT_STRING_3(torch.,CReal,Storage)
#define cutorch_Storage_(NAME) TH_CONCAT_4(cutorch_,CReal,Storage_,NAME)
+ #define cutorch_StorageCopy_(NAME) TH_CONCAT_4(cutorch_,Real,StorageCopy_,NAME)

+ // generate the torch types -- we could also do this via THGenerateAllTypes,
+ // but this allows us to be self contained.
+ #define FORCE_TH_HALF
+ #include "generic/CStorageCopy.c"
+ #include "THCGenerateAllTypes.h"
+ #undef FORCE_TH_HALF
#include "generic/CStorage.c"
#include "THCGenerateAllTypes.h"

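The paired `#include "generic/CStorageCopy.c"` / `#include "THCGenerateAllTypes.h"` lines above instantiate the generic source once per element type. A simplified two-file sketch of that include-driven pattern, using placeholder names rather than the real TH/THC macros:

```c
/* --- mylib_generic.c: parameterized on `real` / `Real`, included once per type --- */
#ifdef MYLIB_REAL
real CAT(sum_, Real)(const real *data, long n)   /* expands to sum_Float, sum_Double, ... */
{
    real acc = 0;
    for (long i = 0; i < n; ++i)
        acc += data[i];
    return acc;
}
#endif

/* --- mylib.c: plays the role of THCGenerateAllTypes.h by re-including the generic file --- */
#include <stdio.h>

#define CAT_(a, b) a##b
#define CAT(a, b)  CAT_(a, b)
#define MYLIB_REAL

#define real float
#define Real Float
#include "mylib_generic.c"
#undef real
#undef Real

#define real double
#define Real Double
#include "mylib_generic.c"
#undef real
#undef Real

int main(void)
{
    float  xf[] = {1.0f, 2.0f, 3.0f};
    double xd[] = {1.0, 2.0, 3.0};
    printf("%f %f\n", sum_Float(xf, 3), sum_Double(xd, 3));
    return 0;
}
```
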
7 changes: 7 additions & 0 deletions Tensor.c
@@ -8,6 +8,13 @@
#define torch_Tensor_(NAME) TH_CONCAT_4(torch_,CReal,Tensor_,NAME)
#define torch_Tensor TH_CONCAT_STRING_3(torch.,CReal,Tensor)
#define cutorch_Tensor_(NAME) TH_CONCAT_4(cutorch_,CReal,Tensor_,NAME)
+ #define cutorch_TensorCopy_(NAME) TH_CONCAT_4(cutorch_,Real,TensorCopy_,NAME)

+ // generate the torch types -- we could also do this via THGenerateAllTypes,
+ // but this allows us to be self contained.
+ #define FORCE_TH_HALF
+ #include "generic/CTensorCopy.c"
+ #include "THCGenerateAllTypes.h"
+ #undef FORCE_TH_HALF
#include "generic/CTensor.c"
#include "THCGenerateAllTypes.h"
8 changes: 4 additions & 4 deletions Tensor.lua
@@ -77,14 +77,14 @@ local CudaTensorTypes = {
long = 'torch.CudaLongTensor'
}

if cutorch.hasHalf then
CudaTensorTypes['half'] = 'torch.CudaHalfTensor'
end

for ValueType, CudaTensorType in pairs(CudaTensorTypes) do
local function Tensor__totable(self)
local host_tensor = self[ValueType](self)
return host_tensor:totable()
end
rawset(torch.getmetatable(CudaTensorType), 'totable', Tensor__totable)
end

if cutorch.hasHalf then
CudaTensorTypes.half = 'torch.CudaHalfTensor'
end
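
For reference, the `totable` registration in this file routes through a host copy; a small usage sketch (values are arbitrary):

```lua
require 'cutorch'

local t = torch.CudaTensor(2, 2):fill(1)
-- totable() first copies to the corresponding host tensor (a FloatTensor for CudaTensor),
-- then converts that host tensor into nested Lua tables.
print(t:totable())   -- { {1, 1}, {1, 1} }
```
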
60 changes: 37 additions & 23 deletions TensorMath.lua
@@ -679,7 +679,7 @@ for k, Tensor_ in pairs(handledTypenames) do
{name=Tensor},
{name="boolean", creturned=true}})

- for _, name in ipairs({"cmul", "cpow", "cdiv"}) do
+ for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -906,20 +906,27 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=Tensor},
{name="index", default=lastdim(2)}},
{name="index", default=-1}},
cname("catArray"),
{{name=Tensor, default=true, returned=true},
{name=Tensor .. "Array"},
{name="index", default=lastdimarray(2)}})
{name="index", default=-1}})

- for _,f in ipairs({{name='geometric'},
- {name='bernoulli', a=0.5}}) do
+ wrap("geometric",
+ cname("geometric"),
+ {{name=Tensor, returned=true},
+ {name='double'}})

- wrap(f.name,
- cname(f.name),
- {{name=Tensor, returned=true},
- {name='double', default=f.a}})
- end
+ wrap("bernoulli",
+ cname("bernoulli"),
+ {{name=Tensor, returned=true},
+ {name='double', default=0.5}},
+ cname("bernoulli_FloatTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaTensor"}},
+ cname("bernoulli_DoubleTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaDoubleTensor"}})

wrap("nonzero",
cname("nonzero"),
@@ -964,7 +971,7 @@

wrap("multinomial",
cname("multinomial"),
- {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {{name='CudaLongTensor', default=true, returned=true, method={default='nil'}},
{name=Tensor},
{name="int"},
{name="boolean", default=false}})
@@ -1450,7 +1457,7 @@ wrap("equal",
{name=Tensor},
{name="boolean", creturned=true}})

- for _, name in ipairs({"cmul", "cpow", "cdiv"}) do
+ for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1820,7 +1827,7 @@ wrap("randn",

wrap("multinomial",
cname("multinomial"),
- {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {{name='CudaLongTensor', default=true, returned=true, method={default='nil'}},
{name=Tensor},
{name="int"},
{name="boolean", default=false}})
@@ -1857,25 +1864,32 @@ wrap("cat",
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=Tensor},
{name="index", default=lastdim(2)}},
{name="index", default=-1}},
cname("catArray"),
{{name=Tensor, default=true, returned=true},
{name=Tensor .. "Array"},
{name="index", default=lastdimarray(2)}})
{name="index", default=-1}})

wrap("nonzero",
cname("nonzero"),
{{name="CudaLongTensor", default=true, returned=true},
{name=Tensor}})

- for _,f in ipairs({{name='geometric'},
- {name='bernoulli', a=0.5}}) do

- wrap(f.name,
- cname(f.name),
- {{name=Tensor, returned=true},
- {name=real, default=f.a}})
- end
+ wrap("geometric",
+ cname("geometric"),
+ {{name=Tensor, returned=true},
+ {name='double'}})

+ wrap("bernoulli",
+ cname("bernoulli"),
+ {{name=Tensor, returned=true},
+ {name='double', default=0.5}},
+ cname("bernoulli_FloatTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaTensor"}},
+ cname("bernoulli_DoubleTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaDoubleTensor"}})

for _,f in ipairs({{name='uniform', a=0, b=1},
{name='normal', a=0, b=1},
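
In terms of the generated Lua API, the changes in this file correspond roughly to the following usage (a sketch; shapes and values are arbitrary, and exact behaviour depends on the generated bindings):

```lua
require 'cutorch'

-- cat: with no dimension given, the new default index of -1 concatenates along the last dimension.
local a = torch.CudaTensor(2, 3):fill(1)
local b = torch.CudaTensor(2, 3):fill(2)
print(torch.cat(a, b):size())            -- 2x6

-- bernoulli: besides a scalar probability, a per-element probability tensor is now accepted.
local r = torch.CudaTensor(2, 3)
local p = torch.CudaTensor(2, 3):uniform()
r:bernoulli(0.25)                        -- scalar probability
r:bernoulli(p)                           -- probabilities taken element-wise from a CudaTensor

-- multinomial: sampled indices now come back as a torch.CudaLongTensor.
local probs = torch.CudaTensor(10):uniform()
local idx = torch.multinomial(probs, 3, false)
print(torch.type(idx))                   -- torch.CudaLongTensor

-- cremainder / cfmod: element-wise remainder against another tensor.
local m = torch.CudaTensor(2, 3):fill(7)
local n = torch.CudaTensor(2, 3):fill(3)
print(m:cremainder(n))                   -- every element becomes 7 % 3 = 1
```
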
69 changes: 16 additions & 53 deletions generic/CStorage.c
@@ -2,12 +2,23 @@
#define THC_GENERIC_FILE "generic/CStorage.c"
#else

#include "THHalf.h"

/* everything is as the generic Storage.c, except few things (see below) */

+ // FixMe: Requires an unsafe conversion in that we convert from cutorch's 'half'
+ // to torch's THHalf. These types are required to be defined in the same way
+ // (is there some way to enforce this?)
+ #ifdef THC_REAL_IS_HALF
+ #define THFILE_REAL_CAST(x) (THHalf *)x
+ #else
+ #define THFILE_REAL_CAST(x) x
+ #endif

#define THFile_readRealRaw(file, data, size) \
{ \
real *fdata = (real*)THAlloc(sizeof(real)*size); \
- TH_CONCAT_3(THFile_read,Real,Raw)(file, fdata, size); \
+ TH_CONCAT_3(THFile_read,Real,Raw)(file, THFILE_REAL_CAST(fdata), size); \
THCudaCheck(cudaMemcpy(data, fdata, size * sizeof(real), cudaMemcpyHostToDevice)); \
THFree(fdata); \
}
@@ -16,14 +27,15 @@
{ \
real *fdata = (real*)THAlloc(sizeof(real)*size); \
THCudaCheck(cudaMemcpy(fdata, data, size * sizeof(real), cudaMemcpyDeviceToHost)); \
- TH_CONCAT_3(THFile_write,Real,Raw)(file, fdata, size); \
+ TH_CONCAT_3(THFile_write,Real,Raw)(file, THFILE_REAL_CAST(fdata), size); \
THFree(fdata); \
}

#define TH_GENERIC_FILE "generic/Storage.c"
#include "generic/Storage.c"

#undef TH_GENERIC_FILE
+ #undef THFILE_REAL_CAST
#undef THFile_readRealRaw
#undef THFile_writeRealRaw

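The read/write macros above stage CUDA data through a temporary host buffer around ordinary `THFile` I/O. A standalone sketch of that host-staging idea (plain CUDA C, error handling omitted; not the actual THC code):

```c
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Write a device buffer to a file by staging through host memory,
 * mirroring what THFile_writeRealRaw does above. */
void write_device_floats(FILE *f, const float *d_data, size_t n)
{
    float *h_tmp = (float *)malloc(n * sizeof(float));
    cudaMemcpy(h_tmp, d_data, n * sizeof(float), cudaMemcpyDeviceToHost); /* synchronous copy */
    fwrite(h_tmp, sizeof(float), n, f);                                   /* ordinary host I/O */
    free(h_tmp);
}

/* Read from a file into a device buffer, mirroring THFile_readRealRaw. */
void read_device_floats(FILE *f, float *d_data, size_t n)
{
    float *h_tmp = (float *)malloc(n * sizeof(float));
    size_t got = fread(h_tmp, sizeof(float), n, f);
    cudaMemcpy(d_data, h_tmp, got * sizeof(float), cudaMemcpyHostToDevice);
    free(h_tmp);
}
```
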
@@ -76,51 +88,6 @@ static int cutorch_Storage_(copy)(lua_State *L)
return 1;
}

- static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L)
- {
- THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage));
- void *src;
- if( (src = luaT_toudata(L, 2, TH_CONCAT_STRING_3(torch.,Real,Storage) )))
- THStorage_(copy)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
- THStorage_(copyByte)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
- THStorage_(copyChar)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
- THStorage_(copyShort)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
- THStorage_(copyInt)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
- THStorage_(copyLong)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
- THStorage_(copyFloat)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
- THStorage_(copyDouble)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
- THStorage_(copyCudaFloat)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
- THStorage_(copyCudaLong)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
- THStorage_(copyCudaByte)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
- THStorage_(copyCudaChar)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
- THStorage_(copyCudaShort)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
- THStorage_(copyCudaInt)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
- THStorage_(copyCudaDouble)(cutorch_getstate(L), storage, src);
- #ifdef CUDA_HALF_TENSOR
- else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
- THStorage_(copyCudaHalf)(cutorch_getstate(L), storage, src);
- #endif
- else
- luaL_typerror(L, 2, "torch.*Storage");
-
- lua_settop(L, 1);
- return 1;
- }

static int cutorch_Storage_(getDevice)(lua_State *L) {
THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
lua_pushinteger(L, THCStorage_(getDevice)(cutorch_getstate(L), storage) + 1);
@@ -132,12 +99,8 @@ void cutorch_Storage_(init)(lua_State* L)
/* the standard stuff */
torch_Storage_(init)(L);

- // torch_Storage macro is defined in Storage.c produce the CudaTensor types
- // so I have to construct the normal torch types by hand
- luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage));
- lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy));
- lua_setfield(L, -2, "copy");
- lua_pop(L, 1);
+ // Register this even though it is generated elsewhere.
+ cutorch_StorageCopy_(init)(L);

luaT_pushmetatable(L, torch_Storage);
lua_pushcfunction(L, cutorch_Storage_(copy));
64 changes: 64 additions & 0 deletions generic/CStorageCopy.c
@@ -0,0 +1,64 @@
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/CStorageCopy.c"
#else

#include "THHalf.h"

static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L)
{
THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage));
void *src;
if( (src = luaT_toudata(L, 2, TH_CONCAT_STRING_3(torch.,Real,Storage) )))
THStorage_(copy)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
THStorage_(copyByte)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
THStorage_(copyChar)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
THStorage_(copyShort)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
THStorage_(copyInt)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
THStorage_(copyLong)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
THStorage_(copyFloat)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
THStorage_(copyDouble)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
THStorage_(copyHalf)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
THStorage_(copyCudaFloat)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
THStorage_(copyCudaLong)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
THStorage_(copyCudaByte)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
THStorage_(copyCudaChar)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
THStorage_(copyCudaShort)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
THStorage_(copyCudaInt)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
THStorage_(copyCudaDouble)(cutorch_getstate(L), storage, src);
#ifdef CUDA_HALF_TENSOR
else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
THStorage_(copyCudaHalf)(cutorch_getstate(L), storage, src);
#endif
else
luaL_typerror(L, 2, "torch.*Storage");

lua_settop(L, 1);
return 1;
}

void cutorch_StorageCopy_(init)(lua_State* L)
{
// torch_Storage macro is defined in Storage.c produce the CudaTensor types
// so I have to construct the normal torch types by hand
luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage));
lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy));
lua_setfield(L, -2, "copy");
lua_pop(L, 1);
}

#endif
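
Once `cutorch_StorageCopy_(init)` is called from `CStorage.c` (see the init changes above), the practical effect is that a CPU storage's `copy` accepts CUDA storages as well. A usage sketch (values are arbitrary):

```lua
require 'cutorch'

local gpu = torch.range(1, 4):cuda():storage()   -- a torch.CudaStorage holding {1, 2, 3, 4}
local cpu = torch.FloatStorage(4)
cpu:copy(gpu)                                    -- dispatches to the copyCudaFloat branch above
print(cpu[4])                                    -- 4
```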

