From a72f68e8a9dc811bf462c861f8cca9623ec139ff Mon Sep 17 00:00:00 2001 From: Howard Su Date: Mon, 24 Oct 2016 23:38:24 +0000 Subject: [PATCH 1/8] we can use _NPROCESSOR_CONF instead of a hack for tegra --- rocks/cutorch-1.0-0.rockspec | 10 ++-------- rocks/cutorch-scm-1.rockspec | 10 ++-------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/rocks/cutorch-1.0-0.rockspec b/rocks/cutorch-1.0-0.rockspec index 593d2a94..07e309e5 100644 --- a/rocks/cutorch-1.0-0.rockspec +++ b/rocks/cutorch-1.0-0.rockspec @@ -22,16 +22,10 @@ build = { type = "command", build_command = [[ -isTegra=$(uname -a | grep -E '(tegra|aarch)' | wc | awk '{print $1'}) -if [ "1" -eq "$isTegra" ] - then - jopts=3 - else - jopts=$(getconf _NPROCESSORS_ONLN) -fi +jopts=$(getconf _NPROCESSORS_CONF) echo "Building on $jopts cores" -cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(jopts) install +cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install ]], platforms = { windows = { diff --git a/rocks/cutorch-scm-1.rockspec b/rocks/cutorch-scm-1.rockspec index ca4c6a8a..83143859 100644 --- a/rocks/cutorch-scm-1.rockspec +++ b/rocks/cutorch-scm-1.rockspec @@ -21,16 +21,10 @@ build = { type = "command", build_command = [[ -isTegra=$(uname -a | grep -E '(tegra|aarch)' | wc | awk '{print $1'}) -if [ "1" -eq "$isTegra" ] - then - jopts=3 - else - jopts=$(getconf _NPROCESSORS_ONLN) -fi +jopts=$(getconf _NPROCESSORS_CONF) echo "Building on $jopts cores" -cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(jopts) install +cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install ]], platforms = { windows = { From dce6cd89aa78d308dd4360484ac21da3fc80c9d0 Mon Sep 17 00:00:00 2001 From: soumith Date: Mon, 24 Oct 2016 17:16:17 -0700 Subject: [PATCH 2/8] some bugfixes for THC --- lib/THC/THCAllocator.c | 14 +------------- lib/THC/THCGeneral.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/lib/THC/THCAllocator.c b/lib/THC/THCAllocator.c index 1bed0fb4..fa55c404 100644 --- a/lib/THC/THCAllocator.c +++ b/lib/THC/THCAllocator.c @@ -18,20 +18,8 @@ static void THCudaHostAllocator_free(void* ctx, void* ptr) { THCudaCheck(cudaFreeHost(ptr)); } -static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) { - if (size < 0) THError("Invalid memory size: %ld", size); - - THCudaHostAllocator_free(ctx, ptr); - - if (size == 0) return NULL; - - THCudaCheck(cudaMallocHost(&ptr, size)); - - return ptr; -} - void THCAllocator_init(THAllocator *cudaHostAllocator) { cudaHostAllocator->malloc = &THCudaHostAllocator_alloc; - cudaHostAllocator->realloc = &THCudaHostAllocator_realloc; + cudaHostAllocator->realloc = NULL; cudaHostAllocator->free = &THCudaHostAllocator_free; } diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 2d478de6..9b4764db 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -261,17 +261,21 @@ void THCudaEnablePeerToPeerAccess(THCState* state) if (access) { cudaError_t err = cudaDeviceEnablePeerAccess(j, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { + /* It is possible that another thread has already enabled access. */ /* Any future call to cudaGetLastError will now return an error, */ /* even though we've already dealt with this specific error here. */ /* Call cudaGetLastError once to reset the last error state. */ cudaGetLastError(); - continue; - } - /* In case there are unknown errors returned from the above */ - THCudaCheck(err); + /* The above should have cleared status */ + THCudaCheck(cudaGetLastError()); + } else { + /* In case there are other unhandled errors returned from the */ + /* above */ + THCudaCheck(err); + } - /* Access could be enabled */ + /* Access could be enabled, or was already enabled */ state->p2pAccessEnabled[i][j] = 1; } } From 94232e233cb1934ea23119d16abb7416ce9c98e9 Mon Sep 17 00:00:00 2001 From: soumith Date: Mon, 24 Oct 2016 22:06:00 -0700 Subject: [PATCH 3/8] allocator updates --- lib/THC/CMakeLists.txt | 2 -- lib/THC/THCAllocator.c | 10 +++--- lib/THC/THCAllocator.h | 2 +- lib/THC/THCCachingAllocator.cpp | 1 + lib/THC/THCGeneral.c | 6 +++- lib/THC/THCGeneral.h.in | 3 +- lib/THC/generic/THCStorage.c | 63 ++++++++++++++++++++++----------- lib/THC/generic/THCStorage.cu | 32 ++++++++++++++--- lib/THC/generic/THCStorage.h | 13 ++++--- 9 files changed, 93 insertions(+), 39 deletions(-) diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index edc0af07..244568fb 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -123,10 +123,8 @@ ELSE() ENDIF() SET(src - THCAllocator.c THCCachingAllocator.cpp THCGeneral.c - THCStorage.c THCStorageCopy.c THCStream.c THCTensor.c diff --git a/lib/THC/THCAllocator.c b/lib/THC/THCAllocator.c index fa55c404..5d36d4c2 100644 --- a/lib/THC/THCAllocator.c +++ b/lib/THC/THCAllocator.c @@ -1,6 +1,6 @@ #include "THCAllocator.h" -static void *THCudaHostAllocator_alloc(void* ctx, ptrdiff_t size) { +static void *THCudaHostAllocator_malloc(void* ctx, ptrdiff_t size) { void* ptr; if (size < 0) THError("Invalid memory size: %ld", size); @@ -18,8 +18,8 @@ static void THCudaHostAllocator_free(void* ctx, void* ptr) { THCudaCheck(cudaFreeHost(ptr)); } -void THCAllocator_init(THAllocator *cudaHostAllocator) { - cudaHostAllocator->malloc = &THCudaHostAllocator_alloc; - cudaHostAllocator->realloc = NULL; - cudaHostAllocator->free = &THCudaHostAllocator_free; +void THCAllocator_init(THCState *state) { + state->cudaHostAllocator->malloc = &THCudaHostAllocator_malloc; + state->cudaHostAllocator->realloc = NULL; + state->cudaHostAllocator->free = &THCudaHostAllocator_free; } diff --git a/lib/THC/THCAllocator.h b/lib/THC/THCAllocator.h index 3481304a..2f85eecb 100644 --- a/lib/THC/THCAllocator.h +++ b/lib/THC/THCAllocator.h @@ -3,6 +3,6 @@ #include "THCGeneral.h" -THC_API void THCAllocator_init(THAllocator *state); +THC_API void THCAllocator_init(THCState *state); #endif diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp index 73b81f63..54db20dd 100644 --- a/lib/THC/THCCachingAllocator.cpp +++ b/lib/THC/THCCachingAllocator.cpp @@ -300,6 +300,7 @@ static cudaError_t THCCachingAllocator_emptyCache(void* ctx) static THCCachingAllocator caching_allocator; static THCDeviceAllocator device_allocator = { &THCCachingAllocator_malloc, + NULL, &THCCachingAllocator_free, &THCCachingAllocator_emptyCache, &caching_allocator diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 9b4764db..0b753999 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -93,6 +93,7 @@ static cudaError_t cudaFreeWrapper(void* ctx, void* devPtr) static THCDeviceAllocator defaultDeviceAllocator = { &cudaMallocWrapper, + NULL, &cudaFreeWrapper, NULL, NULL @@ -129,7 +130,7 @@ void THCudaInit(THCState* state) THCRandom_init(state, numDevices, device); state->cudaHostAllocator = (THAllocator*)malloc(sizeof(THAllocator)); - THCAllocator_init(state->cudaHostAllocator); + THCAllocator_init(state); /* Enable P2P access between all pairs, if possible */ THCudaEnablePeerToPeerAccess(state); @@ -792,3 +793,6 @@ void THCHeapUpdate(THCState *state, ptrdiff_t size) { } #undef GLOBAL_SCRATCH_SPACE_PER_SM_STREAM + +#include "THCStorage.c" +#include "THCAllocator.c" diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index 91351674..8b3ac74d 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -43,7 +43,8 @@ struct THCRNGState; /* Random number generator state. */ struct THCStream; typedef struct _THCDeviceAllocator { - cudaError_t (*malloc)(void*, void**, size_t, cudaStream_t); + cudaError_t (*malloc)( void*, void**, size_t, cudaStream_t); + cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t); cudaError_t (*free)(void*, void*); cudaError_t (*emptyCache)(void*); void* state; diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c index ad685262..e51d1eef 100644 --- a/lib/THC/generic/THCStorage.c +++ b/lib/THC/generic/THCStorage.c @@ -20,53 +20,64 @@ int THCStorage_(elementSize)(THCState *state) void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); - THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real), + cudaMemcpyHostToDevice)); } real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); real value; - THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real), cudaMemcpyDeviceToHost)); + THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real), + cudaMemcpyDeviceToHost)); return value; } THCStorage* THCStorage_(new)(THCState *state) { - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - storage->data = NULL; - storage->size = 0; - storage->refcount = 1; - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; - return storage; + return THCStorage_(newWithSize)(state, 0); } THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) +{ + return THCStorage_(newWithAllocator)( + state, size, + state->cudaDeviceAllocator, + state->cudaDeviceAllocator->state); +} + +THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, + THCDeviceAllocator* allocator, + void* allocatorContext) { THArgCheck(size >= 0, 2, "invalid size"); + THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); + storage->refcount = 1; + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; + storage->allocator = allocator; + storage->allocatorContext = allocatorContext; + storage->size = size; + if(size > 0) { - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - // update heap *before* attempting malloc, to free space for the malloc THCHeapUpdate(state, size * sizeof(real)); cudaError_t err = - THCudaMalloc(state, (void**)&(storage->data), size * sizeof(real)); + (*allocator->malloc)(allocatorContext, (void**)&(storage->data), + size * sizeof(real), + THCState_getCurrentStream(state)); if(err != cudaSuccess){ THCHeapUpdate(state, -size * sizeof(real)); + free(storage); } THCudaCheck(err); - storage->size = size; - storage->refcount = 1; - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; - return storage; - } - else - { - return THCStorage_(new)(state); + } else { + storage->data = NULL; } + return storage; } THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) @@ -111,11 +122,22 @@ THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, p THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size) { + return THCStorage_(newWithDataAndAllocator)(state, data, size, + state->cudaDeviceAllocator, + state->cudaDeviceAllocator->state); +} + +THCStorage* THCStorage_(newWithDataAndAllocator)( + THCState *state, real *data, long size, + THCDeviceAllocator *allocator, void *allocatorContext) { THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); storage->data = data; storage->size = size; storage->refcount = 1; storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; + storage->allocator = allocator; + storage->allocatorContext = allocatorContext; return storage; } @@ -144,7 +166,8 @@ void THCStorage_(free)(THCState *state, THCStorage *self) { if(self->flag & TH_STORAGE_FREEMEM) { THCHeapUpdate(state, -self->size * sizeof(real)); - THCudaCheck(THCudaFree(state, self->data)); + THCudaCheck( + (*self->allocator->free)(self->allocatorContext, self->data)); } THFree(self); } diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu index 63bccd7d..bdef7d36 100644 --- a/lib/THC/generic/THCStorage.cu +++ b/lib/THC/generic/THCStorage.cu @@ -15,14 +15,31 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) { THArgCheck(size >= 0, 2, "invalid size"); + THAssert(self->allocator != NULL); if(!(self->flag & TH_STORAGE_RESIZABLE)) THError("Trying to resize storage that is not resizable"); + if (self->allocator->realloc) { + THCHeapUpdate(state, (size - self->size) * sizeof(real)); + cudaError_t err = (*self->allocator->realloc)( + self->allocatorContext, + (void**)&(self->data), + self->size * sizeof(real), + size * sizeof(real), THCState_getCurrentStream(state)); + if (err != cudaSuccess) { + THCHeapUpdate(state, (self->size - size) * sizeof(real)); + THCudaCheck(err); + } + self->size = size; + return; + } + if(size == 0) { if(self->flag & TH_STORAGE_FREEMEM) { - THCudaCheck(THCudaFree(state, self->data)); + THCudaCheck( + (*self->allocator->free)(self->allocatorContext, self->data)); THCHeapUpdate(state, -self->size * sizeof(real)); } self->data = NULL; @@ -33,7 +50,11 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) real *data = NULL; // update heap *before* attempting malloc, to free space for the malloc THCHeapUpdate(state, size * sizeof(real)); - cudaError_t err = THCudaMalloc(state, (void**)(&data), size * sizeof(real)); + cudaError_t err = + (*self->allocator->malloc)(self->allocatorContext, + (void**)&(data), + size * sizeof(real), + THCState_getCurrentStream(state)); if(err != cudaSuccess) { THCHeapUpdate(state, -size * sizeof(real)); } @@ -45,8 +66,11 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) THMin(self->size, size) * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); - THCudaCheck(THCudaFree(state, self->data)); - THCHeapUpdate(state, -self->size * sizeof(real)); + if(self->flag & TH_STORAGE_FREEMEM) { + THCudaCheck( + (*self->allocator->free)(self->allocatorContext, self->data)); + THCHeapUpdate(state, -self->size * sizeof(real)); + } } self->data = data; diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h index a46caad8..f621c20f 100644 --- a/lib/THC/generic/THCStorage.h +++ b/lib/THC/generic/THCStorage.h @@ -12,7 +12,7 @@ typedef struct THCStorage ptrdiff_t size; int refcount; char flag; - THAllocator *allocator; + THCDeviceAllocator *allocator; void *allocatorContext; struct THCStorage *view; } THCStorage; @@ -37,11 +37,14 @@ THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fil /* takes ownership of data */ THC_API THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size); -THC_API THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, - THAllocator* allocator, - void *allocatorContext); +THC_API THCStorage* THCStorage_(newWithAllocator)( + THCState *state, ptrdiff_t size, + THCDeviceAllocator* allocator, + void *allocatorContext); THC_API THCStorage* THCStorage_(newWithDataAndAllocator)( - THCState *state, real* data, ptrdiff_t size, THAllocator* allocator, void *allocatorContext); + THCState *state, real* data, ptrdiff_t size, + THCDeviceAllocator* allocator, + void *allocatorContext); THC_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag); THC_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag); From 69c9454c07a6da74ab469bb9866c17656fbed8a2 Mon Sep 17 00:00:00 2001 From: soumith Date: Tue, 25 Oct 2016 07:21:54 -0700 Subject: [PATCH 4/8] Store the device in THCStorage --- FFI.lua | 1 + lib/THC/generic/THCStorage.c | 12 ++++++++++++ lib/THC/generic/THCStorage.cu | 10 ++++++---- lib/THC/generic/THCStorage.h | 1 + 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/FFI.lua b/FFI.lua index f347a89d..b2777a2b 100644 --- a/FFI.lua +++ b/FFI.lua @@ -63,6 +63,7 @@ typedef struct THCStorage THAllocator *allocator; void *allocatorContext; struct THCStorage *view; + int device; } THCStorage; typedef struct THCTensor diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c index e51d1eef..eb4777cd 100644 --- a/lib/THC/generic/THCStorage.c +++ b/lib/THC/generic/THCStorage.c @@ -51,6 +51,8 @@ THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, void* allocatorContext) { THArgCheck(size >= 0, 2, "invalid size"); + int device; + THCudaCheck(cudaGetDevice(&device)); THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); memset(storage, 0, sizeof(THCStorage)); @@ -59,6 +61,7 @@ THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, storage->allocator = allocator; storage->allocatorContext = allocatorContext; storage->size = size; + storage->device = device; if(size > 0) { @@ -138,6 +141,15 @@ THCStorage* THCStorage_(newWithDataAndAllocator)( storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; storage->allocator = allocator; storage->allocatorContext = allocatorContext; + int device; + if (data) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, data)); + device = attr.device; + } else { + THCudaCheck(cudaGetDevice(&device)); + } + storage->device = device; return storage; } diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu index bdef7d36..22c900ab 100644 --- a/lib/THC/generic/THCStorage.cu +++ b/lib/THC/generic/THCStorage.cu @@ -16,6 +16,8 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) { THArgCheck(size >= 0, 2, "invalid size"); THAssert(self->allocator != NULL); + int device; + THCudaCheck(cudaGetDevice(&device)); if(!(self->flag & TH_STORAGE_RESIZABLE)) THError("Trying to resize storage that is not resizable"); @@ -32,6 +34,7 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) THCudaCheck(err); } self->size = size; + self->device = device; return; } @@ -44,6 +47,7 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) } self->data = NULL; self->size = 0; + self->device = device; } else { @@ -75,14 +79,12 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) self->data = data; self->size = size; + self->device = device; } } THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { - if (!storage->data) return -1; - cudaPointerAttributes attr; - THCudaCheck(cudaPointerGetAttributes(&attr, storage->data)); - return attr.device; + return storage->device; } #endif diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h index f621c20f..e768ec6f 100644 --- a/lib/THC/generic/THCStorage.h +++ b/lib/THC/generic/THCStorage.h @@ -15,6 +15,7 @@ typedef struct THCStorage THCDeviceAllocator *allocator; void *allocatorContext; struct THCStorage *view; + int device; } THCStorage; From 64f974178c03c93666cfe3796b7e2d7b549476a2 Mon Sep 17 00:00:00 2001 From: soumith Date: Tue, 25 Oct 2016 18:23:53 -0700 Subject: [PATCH 5/8] pushing THCState back to the header --- lib/THC/CMakeLists.txt | 2 ++ lib/THC/THCGeneral.c | 54 -------------------------------------- lib/THC/THCGeneral.h.in | 58 +++++++++++++++++++++++++++++++++++++++-- lib/THC/THCStream.h | 4 +-- 4 files changed, 60 insertions(+), 58 deletions(-) diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index 244568fb..b0813450 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -113,6 +113,7 @@ ELSE() SET(THC_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR}) ENDIF() +INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}") INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}") CONFIGURE_FILE(THCGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h") @@ -200,6 +201,7 @@ INSTALL(FILES THCStorage.h THCStorageCopy.h THCStream.h + THCThreadLocal.h THCTensor.h THCTensorCopy.h THCTensorRandom.h diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 0b753999..0a1d3404 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -12,60 +12,6 @@ #define GLOBAL_SCRATCH_SPACE_PER_SM_STREAM 4 * sizeof(float) -typedef struct _THCCudaResourcesPerDevice { - THCStream** streams; - cublasHandle_t* blasHandles; - /* Size of scratch space per each stream on this device available */ - size_t scratchSpacePerStream; - /* Device-resident scratch space per stream, used for global memory - reduction kernels. */ - void** devScratchSpacePerStream; -} THCCudaResourcesPerDevice; - -struct THCState { - struct THCRNGState* rngState; - struct cudaDeviceProp* deviceProperties; - /* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL, - which specifies the per-device default stream. blasHandles do not have a - default and must be explicitly initialized. We always initialize 1 - blasHandle but we can use more. - */ - THCCudaResourcesPerDevice* resourcesPerDevice; - /* Captured number of devices upon startup; convenience for bounds checking */ - int numDevices; - /* Number of Torch defined resources available, indices 1 ... numStreams */ - int numUserStreams; - int numUserBlasHandles; - - /* Allocator using cudaMallocHost. */ - THAllocator* cudaHostAllocator; - THCDeviceAllocator* cudaDeviceAllocator; - - /* Index of the current selected BLAS handle. The actual BLAS handle used - depends on the current device. */ - THCThreadLocal/**/ currentPerDeviceBlasHandle; - /* Array of thread locals containing the current stream for each device */ - THCThreadLocal* currentStreams; - - /* Table of enabled peer-to-peer access between directed pairs of GPUs. - If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */ - int** p2pAccessEnabled; - - /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU - copies are allowed via p2p if p2p access is enabled at all for - the pair of GPUs in question, but if this flag is true, then - all cross-GPU access checks are disabled, allowing kernels to - directly access memory on another GPUs. - Note that p2p access must exist and be enabled for the pair of - GPUs in question. */ - int p2pKernelAccessEnabled; - - void (*cutorchGCFunction)(void *data); - void *cutorchGCData; - ptrdiff_t heapSoftmax; - ptrdiff_t heapDelta; -}; - THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( THCState *state, int device); diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index 8b3ac74d..22aab031 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -3,6 +3,7 @@ #include "THGeneral.h" #include "THAllocator.h" +#include "THCThreadLocal.h" #undef log1p #include "cuda.h" @@ -40,7 +41,8 @@ #endif struct THCRNGState; /* Random number generator state. */ -struct THCStream; +typedef struct THCStream THCStream; +typedef struct THCState THCState; typedef struct _THCDeviceAllocator { cudaError_t (*malloc)( void*, void**, size_t, cudaStream_t); @@ -50,9 +52,61 @@ typedef struct _THCDeviceAllocator { void* state; } THCDeviceAllocator; +typedef struct _THCCudaResourcesPerDevice { + THCStream** streams; + cublasHandle_t* blasHandles; + /* Size of scratch space per each stream on this device available */ + size_t scratchSpacePerStream; + /* Device-resident scratch space per stream, used for global memory + reduction kernels. */ + void** devScratchSpacePerStream; +} THCCudaResourcesPerDevice; + /* Global state to be held in the cutorch table. */ -typedef struct THCState THCState; +struct THCState { + struct THCRNGState* rngState; + struct cudaDeviceProp* deviceProperties; + /* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL, + which specifies the per-device default stream. blasHandles do not have a + default and must be explicitly initialized. We always initialize 1 + blasHandle but we can use more. + */ + THCCudaResourcesPerDevice* resourcesPerDevice; + /* Captured number of devices upon startup; convenience for bounds checking */ + int numDevices; + /* Number of Torch defined resources available, indices 1 ... numStreams */ + int numUserStreams; + int numUserBlasHandles; + + /* Allocator using cudaMallocHost. */ + THAllocator* cudaHostAllocator; + THCDeviceAllocator* cudaDeviceAllocator; + + /* Index of the current selected BLAS handle. The actual BLAS handle used + depends on the current device. */ + THCThreadLocal/**/ currentPerDeviceBlasHandle; + /* Array of thread locals containing the current stream for each device */ + THCThreadLocal* currentStreams; + + /* Table of enabled peer-to-peer access between directed pairs of GPUs. + If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */ + int** p2pAccessEnabled; + + /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU + copies are allowed via p2p if p2p access is enabled at all for + the pair of GPUs in question, but if this flag is true, then + all cross-GPU access checks are disabled, allowing kernels to + directly access memory on another GPUs. + Note that p2p access must exist and be enabled for the pair of + GPUs in question. */ + int p2pKernelAccessEnabled; + + void (*cutorchGCFunction)(void *data); + void *cutorchGCData; + ptrdiff_t heapSoftmax; + ptrdiff_t heapDelta; +}; THC_API THCState* THCState_alloc(); THC_API void THCState_free(THCState* state); diff --git a/lib/THC/THCStream.h b/lib/THC/THCStream.h index 7e4bb49d..de3f64ed 100644 --- a/lib/THC/THCStream.h +++ b/lib/THC/THCStream.h @@ -4,12 +4,12 @@ #include #include "THCGeneral.h" -typedef struct THCStream +struct THCStream { cudaStream_t stream; int device; int refcount; -} THCStream; +}; THC_API THCStream* THCStream_new(int flags); From 2b2133a7d7246cbaea4c88048c8da27536830a72 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Wed, 26 Oct 2016 12:43:46 -0700 Subject: [PATCH 6/8] Use 'void' for no-arg functions --- lib/THC/THCCachingAllocator.cpp | 2 +- lib/THC/THCCachingAllocator.h | 2 +- lib/THC/THCGeneral.c | 2 +- lib/THC/THCGeneral.h.in | 2 +- lib/THC/THCThreadLocal.c | 2 +- lib/THC/THCThreadLocal.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp index 54db20dd..294b99d7 100644 --- a/lib/THC/THCCachingAllocator.cpp +++ b/lib/THC/THCCachingAllocator.cpp @@ -306,7 +306,7 @@ static THCDeviceAllocator device_allocator = { &caching_allocator }; -THC_API THCDeviceAllocator* THCCachingAllocator_get() +THC_API THCDeviceAllocator* THCCachingAllocator_get(void) { return &device_allocator; } diff --git a/lib/THC/THCCachingAllocator.h b/lib/THC/THCCachingAllocator.h index 711b1da9..5d80bd5b 100644 --- a/lib/THC/THCCachingAllocator.h +++ b/lib/THC/THCCachingAllocator.h @@ -3,6 +3,6 @@ #include "THCGeneral.h" -THC_API THCDeviceAllocator* THCCachingAllocator_get(); +THC_API THCDeviceAllocator* THCCachingAllocator_get(void); #endif diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 0a1d3404..5feb8ec8 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -15,7 +15,7 @@ THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( THCState *state, int device); -THCState* THCState_alloc() +THCState* THCState_alloc(void) { THCState* state = (THCState*) malloc(sizeof(THCState)); memset(state, 0, sizeof(THCState)); diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index 22aab031..c50cc1c7 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -108,7 +108,7 @@ struct THCState { ptrdiff_t heapDelta; }; -THC_API THCState* THCState_alloc(); +THC_API THCState* THCState_alloc(void); THC_API void THCState_free(THCState* state); THC_API void THCudaInit(THCState* state); diff --git a/lib/THC/THCThreadLocal.c b/lib/THC/THCThreadLocal.c index a46bf1eb..26a4093f 100644 --- a/lib/THC/THCThreadLocal.c +++ b/lib/THC/THCThreadLocal.c @@ -2,7 +2,7 @@ #include "THCGeneral.h" -THCThreadLocal THCThreadLocal_alloc() +THCThreadLocal THCThreadLocal_alloc(void) { #ifndef _WIN32 pthread_key_t key; diff --git a/lib/THC/THCThreadLocal.h b/lib/THC/THCThreadLocal.h index 22d970a4..78438214 100644 --- a/lib/THC/THCThreadLocal.h +++ b/lib/THC/THCThreadLocal.h @@ -9,7 +9,7 @@ typedef DWORD THCThreadLocal; typedef pthread_key_t THCThreadLocal; #endif -THCThreadLocal THCThreadLocal_alloc(); +THCThreadLocal THCThreadLocal_alloc(void); void THCThreadLocal_free(THCThreadLocal local); void* THCThreadLocal_get(THCThreadLocal local); void THCThreadLocal_set(THCThreadLocal local, void* value); From 18383174c82fbfd08a9bd8b84639aeec2e4ee22a Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 28 Oct 2016 17:13:04 -0700 Subject: [PATCH 7/8] checkpoint --- CMakeLists.txt | 2 +- Tensor.lua | 9 +--- generic/CStorage.c | 24 +-------- generic/CTensor.c | 11 ---- lib/THC/CMakeLists.txt | 2 +- lib/THC/THCGenerateHalfType.h | 1 - lib/THC/THCHalf.cu | 87 +++---------------------------- lib/THC/THCHalf.h | 7 +-- lib/THC/THCStorage.c | 2 - lib/THC/THCStorage.h | 2 +- lib/THC/THCTensorTypeUtils.cuh | 2 +- lib/THC/generic/THCStorageCopy.c | 75 +++++++++++++------------- lib/THC/generic/THCStorageCopy.cu | 24 +++++---- lib/THC/generic/THCStorageCopy.h | 10 ++-- torch/generic/Tensor.c | 2 +- 15 files changed, 77 insertions(+), 183 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96224c44..c3698e68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ IF (NOT WIN32) SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}") ENDIF (NOT WIN32) IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) - SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}") + SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}") ENDIF() INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) diff --git a/Tensor.lua b/Tensor.lua index 76c87295..0636d682 100644 --- a/Tensor.lua +++ b/Tensor.lua @@ -24,6 +24,7 @@ end local TensorTypes = { float = 'torch.FloatTensor', + half = 'torch.HalfTensor', double = 'torch.DoubleTensor', byte = 'torch.ByteTensor', char = 'torch.CharTensor', @@ -85,11 +86,5 @@ for ValueType, CudaTensorType in pairs(CudaTensorTypes) do end if cutorch.hasHalf then - do - local function Tensor__totable(self) - local host_tensor = self:float() - return self:float():totable() - end - rawset(torch.getmetatable('torch.CudaHalfTensor'), 'totable', Tensor__totable) - end + CudaTensorTypes.half = 'torch.CudaHalfTensor' end diff --git a/generic/CStorage.c b/generic/CStorage.c index a6503f28..d6d4db57 100644 --- a/generic/CStorage.c +++ b/generic/CStorage.c @@ -6,7 +6,6 @@ /* everything is as the generic Storage.c, except few things (see below) */ -#ifndef THC_REAL_IS_HALF #define THFile_readRealRaw(file, data, size) \ { \ real *fdata = (real*)THAlloc(sizeof(real)*size); \ @@ -22,23 +21,6 @@ TH_CONCAT_3(THFile_write,Real,Raw)(file, fdata, size); \ THFree(fdata); \ } -#else -#define THFile_readRealRaw(file, data, size) \ - { \ - real *fdata = (real*)THAlloc(sizeof(real)*size); \ - THFile_readCharRaw(file, (char *)fdata, sizeof(real) * size); \ - THCudaCheck(cudaMemcpy(data, fdata, size * sizeof(real), cudaMemcpyHostToDevice)); \ - THFree(fdata); \ - } - -#define THFile_writeRealRaw(file, data, size) \ - { \ - real *fdata = (real*)THAlloc(sizeof(real)*size); \ - THCudaCheck(cudaMemcpy(fdata, data, size * sizeof(real), cudaMemcpyDeviceToHost)); \ - THFile_writeCharRaw(file, (char *)fdata, size * sizeof(real)); \ - THFree(fdata); \ - } -#endif #define TH_GENERIC_FILE "generic/Storage.c" #include "generic/Storage.c" @@ -87,6 +69,8 @@ static int cutorch_Storage_(copy)(lua_State *L) THCStorage_(copyFloat)(state, storage, src); else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) ) THCStorage_(copyDouble)(state, storage, src); + else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) ) + THCStorage_(copyHalf)(state, storage, src); else luaL_typerror(L, 2, "torch.*Storage"); @@ -94,7 +78,6 @@ static int cutorch_Storage_(copy)(lua_State *L) return 1; } -#ifndef THC_REAL_IS_HALF static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L) { THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage)); @@ -139,7 +122,6 @@ static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L) lua_settop(L, 1); return 1; } -#endif static int cutorch_Storage_(getDevice)(lua_State *L) { THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); @@ -154,12 +136,10 @@ void cutorch_Storage_(init)(lua_State* L) // torch_Storage macro is defined in Storage.c produce the CudaTensor types // so I have to construct the normal torch types by hand -#ifndef THC_REAL_IS_HALF luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage)); lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy)); lua_setfield(L, -2, "copy"); lua_pop(L, 1); -#endif luaT_pushmetatable(L, torch_Storage); lua_pushcfunction(L, cutorch_Storage_(copy)); diff --git a/generic/CTensor.c b/generic/CTensor.c index a9663ff1..64ac76a9 100644 --- a/generic/CTensor.c +++ b/generic/CTensor.c @@ -56,7 +56,6 @@ static int cutorch_Tensor_(copy)(lua_State *L) return 1; } -#ifndef THC_REAL_IS_HALF static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L) { #define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor) @@ -74,10 +73,7 @@ static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L) return 1; #undef STRINGIFY_TENSOR } -#endif - -#ifndef THC_REAL_IS_HALF static int TH_CONCAT_3(cutorch_,Real,Tensor_copy)(lua_State *L) { THTensor *tensor = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Tensor)); @@ -122,9 +118,7 @@ static int TH_CONCAT_3(cutorch_,Real,Tensor_copy)(lua_State *L) lua_settop(L, 1); return 1; } -#endif -#ifndef THC_REAL_IS_HALF static int TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda)(lua_State *L) { #define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor) @@ -139,9 +133,6 @@ static int TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda)(lua_State *L) return 1; #undef STRINGIFY_TENSOR } -#endif - - #ifdef THC_REAL_IS_FLOAT static void THFloatTensor_computesz(THFloatTensor *self, long **sz_, long **st_) @@ -248,7 +239,6 @@ void cutorch_Tensor_(init)(lua_State* L) lua_pop(L, 1); #endif -#ifndef THC_REAL_IS_HALF luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Tensor)); lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Tensor_copy)); lua_setfield(L, -2, "copy"); @@ -264,7 +254,6 @@ void cutorch_Tensor_(init)(lua_State* L) lua_pushcfunction(L, cutorch_Tensor_(copyAsyncCPU)); lua_setfield(L, -2, "copyAsync"); lua_pop(L, 1); -#endif luaT_pushmetatable(L, torch_Tensor); lua_pushcfunction(L, cutorch_Tensor_(copy)); diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index b0813450..c82e4326 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -174,7 +174,7 @@ IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor") LIST(APPEND src-cuda THCHalf.cu) LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1") - SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}") + SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}") ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor") ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h index ac592162..501239d7 100644 --- a/lib/THC/THCGenerateHalfType.h +++ b/lib/THC/THCGenerateHalfType.h @@ -5,7 +5,6 @@ #include "THCHalf.h" #ifdef CUDA_HALF_TENSOR - #define real half #define accreal float #define Real Half diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index 96397ffb..c574bf8d 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -32,92 +32,17 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { float THC_half2float(half a) { - unsigned int bits = a.x & 0x7fff; - unsigned int sign = a.x & 0x8000; - unsigned int exp = a.x & 0x7c00; - - bits <<= 13; - sign <<= 16; - - bits += 0x38000000U; - - // flush denormals to 0 - bits = (exp == 0 ? 0 : bits) | sign; - - union { - float f; - unsigned int v; - } conv; - conv.v = bits; - - return conv.f; + TH_half h; + h.x = a.x; + return TH_half2float(h); } -/* - Copyright (c) 2015, Norbert Juffa - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - half THC_float2half(float a) { - uint32_t ia; - uint16_t ir; - memcpy(&ia, &a, sizeof(float)); - - ir = (ia >> 16) & 0x8000; - if ((ia & 0x7f800000) == 0x7f800000) { - if ((ia & 0x7fffffff) == 0x7f800000) { - ir |= 0x7c00; /* infinity */ - } else { - ir = 0x7fff; /* canonical NaN */ - } - } else if ((ia & 0x7f800000) >= 0x33000000) { - int shift = (int)((ia >> 23) & 0xff) - 127; - if (shift > 15) { - ir |= 0x7c00; /* infinity */ - } else { - ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */ - if (shift < -14) { /* denormal */ - ir |= ia >> (-1 - shift); - ia = ia << (32 - (-1 - shift)); - } else { /* normal */ - ir |= ia >> (24 - 11); - ia = ia << (32 - (24 - 11)); - ir = ir + ((14 + shift) << 10); - } - /* IEEE-754 round to nearest of even */ - if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) { - ir++; - } - } - } - half ret; - memcpy(&ret, &ir, sizeof(half)); - return ret; + TH_half th_res = TH_float2half(a); + ret.x = th_res.x ; + return ret ; } THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 7c055e7a..dc77584b 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -8,9 +8,8 @@ #define CUDA_HALF_TENSOR 1 #endif -#ifdef CUDA_HALF_TENSOR +#include "THHalf.h" -#include #include THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); @@ -24,6 +23,8 @@ THC_API int THC_nativeHalfInstructions(THCState *state); /* Check for performant native fp16 support on the current device */ THC_API int THC_fastHalfInstructions(THCState *state); -#endif /* CUDA_HALF_TENSOR */ +# undef TH_GENERIC_USE_HALF +# define TH_GENERIC_USE_HALF 1 + #endif diff --git a/lib/THC/THCStorage.c b/lib/THC/THCStorage.c index 669efa82..6fc9574e 100644 --- a/lib/THC/THCStorage.c +++ b/lib/THC/THCStorage.c @@ -2,7 +2,5 @@ #include "THCGeneral.h" #include "THAtomic.h" -#include "THCHalf.h" - #include "generic/THCStorage.c" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h index ac1cd709..025a3187 100644 --- a/lib/THC/THCStorage.h +++ b/lib/THC/THCStorage.h @@ -1,8 +1,8 @@ #ifndef THC_STORAGE_INC #define THC_STORAGE_INC +#include "THCHalf.h" #include "THStorage.h" -#include "THCGeneral.h" #define THCStorage TH_CONCAT_3(TH,CReal,Storage) #define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME) diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 81051f75..25472960 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -71,7 +71,7 @@ TENSOR_UTILS(THCudaCharTensor, char, long); TENSOR_UTILS(THCudaShortTensor, short, long); TENSOR_UTILS(THCudaIntTensor, int, long); TENSOR_UTILS(THCudaLongTensor, long, long); -TENSOR_UTILS(THCudaTensor, float, float); +TENSOR_UTILS(THCudaTensor, float, double); TENSOR_UTILS(THCudaDoubleTensor, double, double); #ifdef CUDA_HALF_TENSOR diff --git a/lib/THC/generic/THCStorageCopy.c b/lib/THC/generic/THCStorageCopy.c index af5dbcc2..bf7a628f 100644 --- a/lib/THC/generic/THCStorageCopy.c +++ b/lib/THC/generic/THCStorageCopy.c @@ -2,40 +2,36 @@ #define THC_GENERIC_FILE "generic/THCStorageCopy.c" #else -#ifndef THC_REAL_IS_HALF void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) { THArgCheck(self->size == src->size, 2, "size does not match"); THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyHostToDevice)); } -#endif #ifndef THC_REAL_IS_HALF -#define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \ - void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ - { \ - if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ - THCStorage_(copyCPU)(state, self, (THStorage*) src); /* cast just removes compiler warning */ \ - } else { \ - THStorage *buffer; \ - THArgCheck(self->size == src->size, 2, "size does not match"); \ - buffer = THStorage_(newWithSize)(src->size); \ +#define TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC) \ + THStorage *buffer= THStorage_(newWithSize)(src->size); \ THStorage_(copy##TYPEC)(buffer, src); \ THCStorage_(copyCPU)(state, self, buffer); \ - THStorage_(free)(buffer); \ - } \ - } + THStorage_(free)(buffer); #else +#define TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC) \ + THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ + THCudaStorage_copy##TYPEC(state, buffer, src); \ + THCFloat2Half(state, self->data, buffer->data, src->size); \ + THCudaStorage_free(state, buffer); +#endif + #define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \ void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ { \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THCStorage_(copyCPU)(state, self, (THStorage*) src); /* cast just removes compiler warning */ \ + } else { \ THArgCheck(self->size == src->size, 2, "size does not match"); \ - THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ - THCudaStorage_copy##TYPEC(state, buffer, src); \ - THCFloat2Half(state, self->data, buffer->data, src->size); \ - THCudaStorage_free(state, buffer); \ + TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC) \ + } \ } -#endif TH_CUDA_STORAGE_IMPLEMENT_COPY(Byte) TH_CUDA_STORAGE_IMPLEMENT_COPY(Char) @@ -43,52 +39,57 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Short) TH_CUDA_STORAGE_IMPLEMENT_COPY(Int) TH_CUDA_STORAGE_IMPLEMENT_COPY(Long) TH_CUDA_STORAGE_IMPLEMENT_COPY(Float) +#ifdef CUDA_HALF_TENSOR +TH_CUDA_STORAGE_IMPLEMENT_COPY(Half) +#endif TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) -#ifndef THC_REAL_IS_HALF void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) { THArgCheck(self->size == src->size, 2, "size does not match"); THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyDeviceToHost)); } -#endif #ifndef THC_REAL_IS_HALF -#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ - void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ - { \ - if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ - THStorage_(copyCuda)(state, (THStorage*) self, src); /* cast just removes compiler warnings */ \ - } else { \ +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC) \ THStorage *buffer; \ - THArgCheck(self->size == src->size, 2, "size does not match"); \ buffer = THStorage_(newWithSize)(src->size); \ THStorage_(copyCuda)(state, buffer, src); \ TH_CONCAT_4(TH,TYPEC,Storage_copy,Real)(self, buffer); \ - THStorage_(free)(buffer); \ - } \ - } + THStorage_(free)(buffer); #else -#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ - void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ - { \ - THArgCheck(self->size == src->size, 2, "size does not match"); \ +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC) \ THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size);\ THCHalf2Float(state, buffer->data, src->data, src->size); \ TH_CONCAT_3(TH,TYPEC,Storage_copyCudaFloat)(state, self, buffer); \ - THCudaStorage_free(state, buffer); \ - } + THCudaStorage_free(state, buffer); #endif +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ + void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ + { \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THStorage_(copyCuda)(state, (THStorage*) self, src); /* cast just removes compiler warnings */ \ + } else { \ + THArgCheck(self->size == src->size, 2, "size does not match"); \ + TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC) \ + } \ + } + TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Int) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Long) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Float) +#ifdef CUDA_HALF_TENSOR +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Half) +#endif TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double) #undef TH_CUDA_STORAGE_IMPLEMENT_COPY #undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO +#undef TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX +#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX #endif diff --git a/lib/THC/generic/THCStorageCopy.cu b/lib/THC/generic/THCStorageCopy.cu index 298f7179..1b224c43 100644 --- a/lib/THC/generic/THCStorageCopy.cu +++ b/lib/THC/generic/THCStorageCopy.cu @@ -43,14 +43,18 @@ void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src) void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ { \ THArgCheck(self->size == src->size, 2, "size does not match"); \ - if(THCTypeIdx_(TYPEC) == THCTypeIdxFloat) { \ - THCFloat2Half(state, self->data, (float*) src->data, src->size); /* cast removes compiler error */ \ - } else { \ - THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ - THCudaStorage_copyCuda##TYPEC(state, buffer, src); \ - THCFloat2Half(state, self->data, buffer->data, buffer->size); \ - THCudaStorage_free(state, buffer); \ - } \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THCStorage_(copy)(state, self, (THCStorage*) src); /* cast just removes compiler warning */ \ + } else { \ + if(THCTypeIdx_(TYPEC) == THCTypeIdxFloat) { \ + THCFloat2Half(state, self->data, (float*) src->data, src->size); /* cast removes compiler error */ \ + } else { \ + THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ + THCudaStorage_copyCuda##TYPEC(state, buffer, src); \ + THCFloat2Half(state, self->data, buffer->data, buffer->size); \ + THCudaStorage_free(state, buffer); \ + } \ + } \ } #endif @@ -62,7 +66,7 @@ THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long) THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,) // i.e. float THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double) -#ifdef CUDA_HALF_TENSOR +#if defined (CUDA_HALF_TENSOR) #define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Storage_copyCudaFloat) void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaHalfStorage *src) { @@ -76,7 +80,7 @@ void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaH THCudaStorage_free(state, buffer); } } -#undef FLOAT_COPY +# undef FLOAT_COPY #endif // CUDA_HALF_TENSOR #undef THC_CUDA_STORAGE_IMPLEMENT_COPY diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h index c3e56013..7f1ccb08 100644 --- a/lib/THC/generic/THCStorageCopy.h +++ b/lib/THC/generic/THCStorageCopy.h @@ -13,6 +13,7 @@ THC_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct T THC_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src); THC_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src); THC_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src); +THC_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src); THC_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src); THC_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src); @@ -21,7 +22,8 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src); THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); -#ifdef CUDA_HALF_TENSOR + +#if 0 /* def CUDA_HALF_TENSOR */ THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); #endif @@ -32,12 +34,12 @@ THC_API void TH_CONCAT_2(THIntStorage_copyCuda , Real)(THCState *state, THIntS THC_API void TH_CONCAT_2(THLongStorage_copyCuda , Real)(THCState *state, THLongStorage *self, struct THCStorage *src); THC_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src); THC_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src); +#ifdef CUDA_HALF_TENSOR +THC_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src); +#endif -/* There is no THHalfStorage */ -#ifndef THC_REAL_IS_HALF THC_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src); THC_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src); THC_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src); -#endif #endif diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c index d7dcd53b..53b082c0 100644 --- a/torch/generic/Tensor.c +++ b/torch/generic/Tensor.c @@ -143,7 +143,7 @@ static int torch_Tensor_(new)(lua_State *L) luaL_error(L, "invalid element (not a number)"); } -#ifdef THC_REAL_IS_HALF +#ifndef THC_HALF half value = THC_float2half((float) lua_tonumber(L, -1)); #else real value = (real) lua_tonumber(L, -1); From 10ef05657e6f800127797e620c1ad2ced596d33e Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sat, 29 Oct 2016 15:54:48 -0700 Subject: [PATCH 8/8] Using half from TH --- generic/CStorage.c | 2 -- generic/CTensor.c | 2 -- init.c | 5 ++--- lib/THC/THCAtomics.cuh | 2 +- lib/THC/THCBlas.cu | 2 -- lib/THC/THCBlas.h | 1 - lib/THC/THCGeneral.h.in | 10 ++++------ lib/THC/THCGenerateHalfType.h | 2 +- lib/THC/THCHalf.cu | 17 +---------------- lib/THC/THCHalf.h | 11 +++++++---- lib/THC/THCNumerics.cuh | 3 ++- lib/THC/THCStorage.cu | 2 -- lib/THC/THCStorage.h | 3 ++- lib/THC/THCStorageCopy.c | 3 --- lib/THC/THCStorageCopy.cu | 3 +-- lib/THC/THCStorageCopy.h | 1 - lib/THC/THCTensor.h | 3 ++- lib/THC/THCTensorCopy.c | 4 +--- lib/THC/THCTensorCopy.cu | 1 - lib/THC/THCTensorCopy.h | 1 - lib/THC/THCTensorIndex.cu | 6 +++--- lib/THC/THCTensorMath.h | 1 - lib/THC/THCTensorMathPairwise.cu | 3 +-- lib/THC/THCTensorMathPointwise.cuh | 3 +-- lib/THC/THCTensorTypeUtils.cu | 1 - lib/THC/THCTensorTypeUtils.cuh | 1 - lib/THC/generic/THCStorageCopy.h | 2 +- lib/THC/generic/THCTensorCopy.h | 9 ++++++--- test/test.lua | 4 ++-- torch/generic/Tensor.c | 4 +--- torch/utils.h | 2 ++ 31 files changed, 41 insertions(+), 73 deletions(-) diff --git a/generic/CStorage.c b/generic/CStorage.c index d6d4db57..9be795fd 100644 --- a/generic/CStorage.c +++ b/generic/CStorage.c @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/CStorage.c" #else -#include "THCHalf.h" - /* everything is as the generic Storage.c, except few things (see below) */ #define THFile_readRealRaw(file, data, size) \ diff --git a/generic/CTensor.c b/generic/CTensor.c index 64ac76a9..e22ba59d 100644 --- a/generic/CTensor.c +++ b/generic/CTensor.c @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/CTensor.c" #else -#include "THCHalf.h" - /* everything is as the generic Storage.c, except few things (see below) */ #define TH_GENERIC_FILE "generic/Tensor.c" diff --git a/init.c b/init.c index d424aa6e..07ad51be 100644 --- a/init.c +++ b/init.c @@ -3,7 +3,6 @@ #include "THCGeneral.h" #include "THCCachingAllocator.h" #include "THCTensorRandom.h" -#include "THCHalf.h" // for CUDA_HALF_TENSOR extern void cutorch_CudaByteStorage_init(lua_State* L); extern void cutorch_CudaCharStorage_init(lua_State* L); @@ -911,11 +910,11 @@ static int cutorch_hasHalfInstructions(lua_State *L) { static int cutorch_hasFastHalfInstructions(lua_State *L) { THCState *state = cutorch_getstate(L); -#ifdef CUDA_HALF_TENSOR +#ifdef CUDA_HALF_TENSOR lua_pushboolean(L, THC_fastHalfInstructions(state)); #else lua_pushboolean(L, 0); -#endif +#endif return 1; } diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh index 42291144..31274e2c 100644 --- a/lib/THC/THCAtomics.cuh +++ b/lib/THC/THCAtomics.cuh @@ -1,7 +1,7 @@ #ifndef THC_ATOMICS_INC #define THC_ATOMICS_INC -#include "THCHalf.h" +#include "THCGeneral.h" template struct AtomicAddIntegerImpl; diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu index e3462025..26ab4398 100644 --- a/lib/THC/THCBlas.cu +++ b/lib/THC/THCBlas.cu @@ -1,6 +1,4 @@ #include "THCBlas.h" -#include "THCGeneral.h" -#include "THCHalf.h" float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy) { diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h index 45f58eba..5a0c58d6 100644 --- a/lib/THC/THCBlas.h +++ b/lib/THC/THCBlas.h @@ -2,7 +2,6 @@ #define THC_BLAS_INC #include "THCGeneral.h" -#include "THCHalf.h" /* Level 1 */ THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy); diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index c50cc1c7..a08bf5a1 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -6,12 +6,6 @@ #include "THCThreadLocal.h" #undef log1p -#include "cuda.h" -#include "cuda_runtime.h" -#include "cublas_v2.h" - -#cmakedefine USE_MAGMA - #ifdef __cplusplus # define THC_EXTERNC extern "C" #else @@ -44,6 +38,10 @@ struct THCRNGState; /* Random number generator state. */ typedef struct THCStream THCStream; typedef struct THCState THCState; +#include "THCHalf.h" + +#cmakedefine USE_MAGMA + typedef struct _THCDeviceAllocator { cudaError_t (*malloc)( void*, void**, size_t, cudaStream_t); cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t); diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h index 501239d7..6f46e7f0 100644 --- a/lib/THC/THCGenerateHalfType.h +++ b/lib/THC/THCGenerateHalfType.h @@ -2,7 +2,7 @@ #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" #endif -#include "THCHalf.h" +#include "THCGeneral.h" #ifdef CUDA_HALF_TENSOR #define real half diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index c574bf8d..bb19d498 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -1,4 +1,4 @@ -#include "THCHalf.h" +#include "THCGeneral.h" #include #include @@ -30,21 +30,6 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { in, in + len, out, __half2floatOp()); } -float THC_half2float(half a) -{ - TH_half h; - h.x = a.x; - return TH_half2float(h); -} - -half THC_float2half(float a) -{ - half ret; - TH_half th_res = TH_float2half(a); - ret.x = th_res.x ; - return ret ; -} - THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index dc77584b..01b0a692 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -1,11 +1,14 @@ #ifndef THC_HALF_CONVERSION_INC #define THC_HALF_CONVERSION_INC -#include "THCGeneral.h" +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cuda_fp16.h" /* We compile with CudaHalfTensor support if we have this: */ #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 -#define CUDA_HALF_TENSOR 1 +# define CUDA_HALF_TENSOR 1 #endif #include "THHalf.h" @@ -14,8 +17,8 @@ THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len); -THC_API half THC_float2half(float a); -THC_API float THC_half2float(half a); +# define THC_float2half(a) TH_float2half(a) +# define THC_half2float(a) TH_half2float(a) /* Check for native fp16 support on the current device (CC 5.3+) */ THC_API int THC_nativeHalfInstructions(THCState *state); diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index 09443605..4765048a 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -3,7 +3,8 @@ #include #include -#include "THCHalf.h" + +#include "THCGeneral.h" /// Class for numeric limits of the particular data type, which /// includes support for `half`. diff --git a/lib/THC/THCStorage.cu b/lib/THC/THCStorage.cu index a23794c2..2ceb0c7f 100644 --- a/lib/THC/THCStorage.cu +++ b/lib/THC/THCStorage.cu @@ -6,7 +6,5 @@ #include #endif -#include "THCHalf.h" - #include "generic/THCStorage.cu" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h index 025a3187..6a27fa34 100644 --- a/lib/THC/THCStorage.h +++ b/lib/THC/THCStorage.h @@ -1,7 +1,8 @@ #ifndef THC_STORAGE_INC #define THC_STORAGE_INC -#include "THCHalf.h" +#include "THCGeneral.h" + #include "THStorage.h" #define THCStorage TH_CONCAT_3(TH,CReal,Storage) diff --git a/lib/THC/THCStorageCopy.c b/lib/THC/THCStorageCopy.c index cf2bf8a6..721da7fe 100644 --- a/lib/THC/THCStorageCopy.c +++ b/lib/THC/THCStorageCopy.c @@ -1,7 +1,4 @@ #include "THCStorageCopy.h" -#include "THCGeneral.h" - -#include "THCHalf.h" #include "generic/THCStorageCopy.c" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorageCopy.cu b/lib/THC/THCStorageCopy.cu index 18496863..b245d90e 100644 --- a/lib/THC/THCStorageCopy.cu +++ b/lib/THC/THCStorageCopy.cu @@ -1,7 +1,6 @@ -#include "THCStorageCopy.h" #include "THCGeneral.h" +#include "THCStorageCopy.h" -#include "THCHalf.h" #include "generic/THCStorageCopy.cu" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h index 837056fc..ec8011d3 100644 --- a/lib/THC/THCStorageCopy.h +++ b/lib/THC/THCStorageCopy.h @@ -3,7 +3,6 @@ #include "THCStorage.h" #include "THCGeneral.h" -#include "THCHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensor.h b/lib/THC/THCTensor.h index d4eb49a3..8a3ab9ff 100644 --- a/lib/THC/THCTensor.h +++ b/lib/THC/THCTensor.h @@ -1,9 +1,10 @@ #ifndef THC_TENSOR_INC #define THC_TENSOR_INC +#include "THCGeneral.h" #include "THTensor.h" #include "THCStorage.h" -#include "THCGeneral.h" + #define THCTensor TH_CONCAT_3(TH,CReal,Tensor) #define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME) diff --git a/lib/THC/THCTensorCopy.c b/lib/THC/THCTensorCopy.c index 1bf8980d..9030ab6a 100644 --- a/lib/THC/THCTensorCopy.c +++ b/lib/THC/THCTensorCopy.c @@ -1,8 +1,6 @@ -#include "THCTensorCopy.h" #include "THCGeneral.h" #include "THCTensor.h" - -#include "THCHalf.h" +#include "THCTensorCopy.h" #include "generic/THCTensorCopy.c" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu index 8889939f..bb6425c5 100644 --- a/lib/THC/THCTensorCopy.cu +++ b/lib/THC/THCTensorCopy.cu @@ -1,5 +1,4 @@ #include "THCApply.cuh" -#include "THCHalf.h" #include "THCNumerics.cuh" inline int curGPU() { diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h index e8bc4f4b..fc206cb7 100644 --- a/lib/THC/THCTensorCopy.h +++ b/lib/THC/THCTensorCopy.h @@ -3,7 +3,6 @@ #include "THCTensor.h" #include "THCGeneral.h" -#include "THCHalf.h" #include "generic/THCTensorCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu index 415e6256..1fe3683f 100644 --- a/lib/THC/THCTensorIndex.cu +++ b/lib/THC/THCTensorIndex.cu @@ -1,10 +1,10 @@ -#include "THC.h" -#include "THCTensorMath.h" #include "THCGeneral.h" + +#include "THCTensorMath.h" #include "THCBlas.h" #include "THCTensorCopy.h" #include "THCTensorRandom.h" -#include "THCHalf.h" + #include "THCApply.cuh" #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h index 3d714692..fd4d0ae0 100644 --- a/lib/THC/THCTensorMath.h +++ b/lib/THC/THCTensorMath.h @@ -2,7 +2,6 @@ #define TH_CUDA_TENSOR_MATH_INC #include "THCTensor.h" -#include "THCGeneral.h" #include "generic/THCTensorMath.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 2695f2df..c482ae78 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -1,6 +1,5 @@ -#include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "THCTensorMath.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh index c52e0827..cd9055a7 100644 --- a/lib/THC/THCTensorMathPointwise.cuh +++ b/lib/THC/THCTensorMathPointwise.cuh @@ -1,9 +1,8 @@ #ifndef THC_TENSORMATH_POINTWISE_CUH #define THC_TENSORMATH_POINTWISE_CUH -#include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "THCTensorMath.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu index a273a728..d3019a07 100644 --- a/lib/THC/THCTensorTypeUtils.cu +++ b/lib/THC/THCTensorTypeUtils.cu @@ -1,7 +1,6 @@ #include "THCTensorTypeUtils.cuh" #include "THCTensor.h" #include "THCTensorCopy.h" -#include "THCHalf.h" #include namespace { diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 25472960..ce8f21ac 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -4,7 +4,6 @@ #include #include #include "THCGeneral.h" -#include "THCHalf.h" #include "THCTensor.h" #include "THCTensorInfo.cuh" diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h index 7f1ccb08..c930a0f5 100644 --- a/lib/THC/generic/THCStorageCopy.h +++ b/lib/THC/generic/THCStorageCopy.h @@ -23,7 +23,7 @@ THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, str THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); -#if 0 /* def CUDA_HALF_TENSOR */ +#ifdef CUDA_HALF_TENSOR THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); #endif diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h index 71d878d7..0bec3790 100644 --- a/lib/THC/generic/THCTensorCopy.h +++ b/lib/THC/generic/THCTensorCopy.h @@ -11,6 +11,9 @@ THC_API void THCTensor_(copyInt)(THCState *state, THCTensor *self, THIntTensor * THC_API void THCTensor_(copyLong)(THCState *state, THCTensor *self, THLongTensor *src); THC_API void THCTensor_(copyFloat)(THCState *state, THCTensor *self, THFloatTensor *src); THC_API void THCTensor_(copyDouble)(THCState *state, THCTensor *self, THDoubleTensor *src); +#ifdef CUDA_HALF_TENSOR +THC_API void THCTensor_(copyHalf)(THCState *state, THCTensor *self, struct THHalfTensor *src); +#endif THC_API void THCTensor_(copyCudaByte)(THCState *state, THCTensor *dst, struct THCudaByteTensor *src); THC_API void THCTensor_(copyCudaChar)(THCState *state, THCTensor *dst, struct THCudaCharTensor *src); @@ -30,15 +33,15 @@ THC_API void TH_CONCAT_2(THIntTensor_copyCuda , Real) (THCState *state, THInt THC_API void TH_CONCAT_2(THLongTensor_copyCuda , Real) (THCState *state, THLongTensor *self, THCTensor *src); THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real) (THCState *state, THFloatTensor *self, THCTensor *src); THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real) (THCState *state, THDoubleTensor *self, THCTensor *src); +#ifdef CUDA_HALF_TENSOR +THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real) (THCState *state, THDoubleTensor *self, THCTensor *src); +#endif THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src); -/* There is no THHalfTensor */ -#ifndef THC_REAL_IS_HALF THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src); THC_API void THCTensor_(copyCPU) (THCState *state, THCTensor *self, THTensor *src); THC_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src); THC_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src); -#endif #endif diff --git a/test/test.lua b/test/test.lua index 058103d9..6525de95 100644 --- a/test/test.lua +++ b/test/test.lua @@ -2483,7 +2483,7 @@ function test.logNormal() local sz1 = chooseInt(minsize, maxsize) local sz2 = chooseInt(minsize, maxsize) local mean, std = torch.uniform(), 0.1 * torch.uniform() - local tolerance = 0.01 + local tolerance = 0.02 local t = torch.CudaTensor(sz1, sz2) t:logNormal(mean, std) @@ -3261,7 +3261,7 @@ function test.cat() end function test.catArray() - for k, typename in ipairs(typenames) do + for k, typename in ipairs(typenames) do for dim = 1, 3 do local x = torch.Tensor(13, minsize, minsize):uniform() :type(typename):transpose(1, dim) diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c index 53b082c0..5e02ab1b 100644 --- a/torch/generic/Tensor.c +++ b/torch/generic/Tensor.c @@ -2,8 +2,6 @@ #define TH_GENERIC_FILE "generic/Tensor.c" #else -#include "THCHalf.h" - static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride, THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_); @@ -143,7 +141,7 @@ static int torch_Tensor_(new)(lua_State *L) luaL_error(L, "invalid element (not a number)"); } -#ifndef THC_HALF +#ifdef THC_REAL_IS_HALF half value = THC_float2half((float) lua_tonumber(L, -1)); #else real value = (real) lua_tonumber(L, -1); diff --git a/torch/utils.h b/torch/utils.h index ae959b73..74d68c66 100644 --- a/torch/utils.h +++ b/torch/utils.h @@ -1,6 +1,8 @@ #ifndef CUTORCH_UTILS_INC #define CUTORCH_UTILS_INC +#include "THCGeneral.h" + #include "luaT.h" #include "TH.h"