This repository has been archived by the owner on Aug 3, 2021. It is now read-only.

Commit

Merge remote-tracking branch 'upstream/master' into spec_refactor
Conflicts:
	CMakeLists.txt
	Tensor.lua
	generic/CStorage.c
	generic/CTensor.c
	init.c
	lib/THC/THCCachingAllocator.cpp
	lib/THC/THCGeneral.c
	lib/THC/THCGenerateHalfType.h
	lib/THC/THCHalf.cu
	lib/THC/THCStorageCopy.c
	lib/THC/THCStorageCopy.cu
	lib/THC/THCTensorCopy.c
	lib/THC/generic/THCStorageCopy.c
	lib/THC/generic/THCStorageCopy.cu
	lib/THC/generic/THCStorageCopy.h
	lib/THC/generic/THCTensorCopy.h
	test/test_shutdown.lua
borisfom committed Jan 14, 2017
2 parents 03a77af + 16fb59f commit 5b7ed04
Showing 65 changed files with 2,097 additions and 803 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -11,8 +11,8 @@ IF (NOT WIN32)
SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}")
ENDIF (NOT WIN32)
IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
- SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}")
- SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_CXX_FLAGS}")
+ ADD_DEFINITIONS(-DTH_GENERIC_USE_HALF=1)
+ ADD_DEFINITIONS(-DCUDA_HAS_FP16=1)
ENDIF()

INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
4 changes: 2 additions & 2 deletions README.md
@@ -30,9 +30,9 @@ Most other (besides float) CPU torch tensor types now have a cutorch equivalent,
### CUDA memory allocation
Set the environment variable `THC_CACHING_ALLOCATOR=1` to enable the caching CUDA memory allocator.

- By default, cutorch calls `cudaMalloc` and `cudaFree` when CUDA tensors are allocated and freed. This is expensive because `cudaFree` synchronizes the CPU with the GPU. Setting `THC_CACHING_ALLOCATOR=1` will cause cutorch to cache and re-use CUDA allocations to avoid synchronizations.
+ By default, cutorch calls `cudaMalloc` and `cudaFree` when CUDA tensors are allocated and freed. This is expensive because `cudaFree` synchronizes the CPU with the GPU. Setting `THC_CACHING_ALLOCATOR=1` will cause cutorch to cache and re-use CUDA device and pinned memory allocations to avoid synchronizations.

- With the caching memory allocator, allocations and frees should logically be considered "usages" of the memory segment associated with streams, just like kernel launches. The programmer must insert the proper synchronization if memory segments are used from multiple streams.
+ With the caching memory allocator, device allocations and frees should logically be considered "usages" of the memory segment associated with streams, just like kernel launches. The programmer must insert the proper synchronization if memory segments are used from multiple streams.

###`cutorch.*` API
- `cutorch.synchronize()` : All of the CUDA API is asynchronous (barring a few functions), which means that you can queue up operations. To wait for the operations to finish, you can issue `cutorch.synchronize()` in your code, when the code waits for all GPU operations on the current GPU to finish. WARNING: synchronizes the CPU host with respect to the current device (as per `cutorch.getDevice()`) only.
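
As a rough example of the `cutorch.synchronize()` behaviour described above (sizes and operations are arbitrary):

```lua
require 'cutorch'

local x = torch.CudaTensor(1000000):fill(1)
local y = x * 2           -- kernels are queued asynchronously on the current device
cutorch.synchronize()     -- blocks the host until all queued work on the current GPU has finished
print(y:sum())
```
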
7 changes: 7 additions & 0 deletions Storage.c
@@ -6,7 +6,14 @@
#define torch_Storage_(NAME) TH_CONCAT_4(torch_,CReal,Storage_,NAME)
#define torch_Storage TH_CONCAT_STRING_3(torch.,CReal,Storage)
#define cutorch_Storage_(NAME) TH_CONCAT_4(cutorch_,CReal,Storage_,NAME)
+ #define cutorch_StorageCopy_(NAME) TH_CONCAT_4(cutorch_,Real,StorageCopy_,NAME)

+ // generate the torch types -- we could also do this via THGenerateAllTypes,
+ // but this allows us to be self contained.
+ #define FORCE_TH_HALF
+ #include "generic/CStorageCopy.c"
+ #include "THCGenerateAllTypes.h"
+ #undef FORCE_TH_HALF
#include "generic/CStorage.c"
#include "THCGenerateAllTypes.h"

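The paired `#include "generic/CStorageCopy.c"` / `#include "THCGenerateAllTypes.h"` lines above instantiate the generic source once per element type. A simplified two-file sketch of that include-driven pattern, using placeholder names rather than the real TH/THC macros:

```c
/* --- mylib_generic.c: parameterized on `real` / `Real`, included once per type --- */
#ifdef MYLIB_REAL
real CAT(sum_, Real)(const real *data, long n)   /* expands to sum_Float, sum_Double, ... */
{
    real acc = 0;
    for (long i = 0; i < n; ++i)
        acc += data[i];
    return acc;
}
#endif

/* --- mylib.c: plays the role of THCGenerateAllTypes.h by re-including the generic file --- */
#include <stdio.h>

#define CAT_(a, b) a##b
#define CAT(a, b)  CAT_(a, b)
#define MYLIB_REAL

#define real float
#define Real Float
#include "mylib_generic.c"
#undef real
#undef Real

#define real double
#define Real Double
#include "mylib_generic.c"
#undef real
#undef Real

int main(void)
{
    float  xf[] = {1.0f, 2.0f, 3.0f};
    double xd[] = {1.0, 2.0, 3.0};
    printf("%f %f\n", sum_Float(xf, 3), sum_Double(xd, 3));
    return 0;
}
```
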
7 changes: 7 additions & 0 deletions Tensor.c
@@ -8,6 +8,13 @@
#define torch_Tensor_(NAME) TH_CONCAT_4(torch_,CReal,Tensor_,NAME)
#define torch_Tensor TH_CONCAT_STRING_3(torch.,CReal,Tensor)
#define cutorch_Tensor_(NAME) TH_CONCAT_4(cutorch_,CReal,Tensor_,NAME)
+ #define cutorch_TensorCopy_(NAME) TH_CONCAT_4(cutorch_,Real,TensorCopy_,NAME)

+ // generate the torch types -- we could also do this via THGenerateAllTypes,
+ // but this allows us to be self contained.
+ #define FORCE_TH_HALF
+ #include "generic/CTensorCopy.c"
+ #include "THCGenerateAllTypes.h"
+ #undef FORCE_TH_HALF
#include "generic/CTensor.c"
#include "THCGenerateAllTypes.h"
8 changes: 4 additions & 4 deletions Tensor.lua
@@ -77,14 +77,14 @@ local CudaTensorTypes = {
long = 'torch.CudaLongTensor'
}

if cutorch.hasHalf then
CudaTensorTypes['half'] = 'torch.CudaHalfTensor'
end

for ValueType, CudaTensorType in pairs(CudaTensorTypes) do
local function Tensor__totable(self)
local host_tensor = self[ValueType](self)
return host_tensor:totable()
end
rawset(torch.getmetatable(CudaTensorType), 'totable', Tensor__totable)
end

if cutorch.hasHalf then
CudaTensorTypes.half = 'torch.CudaHalfTensor'
end
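
For reference, the `totable` registration in this file routes through a host copy; a small usage sketch (values are arbitrary):

```lua
require 'cutorch'

local t = torch.CudaTensor(2, 2):fill(1)
-- totable() first copies to the corresponding host tensor (a FloatTensor for CudaTensor),
-- then converts that host tensor into nested Lua tables.
print(t:totable())   -- { {1, 1}, {1, 1} }
```
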
60 changes: 37 additions & 23 deletions TensorMath.lua
@@ -679,7 +679,7 @@ for k, Tensor_ in pairs(handledTypenames) do
{name=Tensor},
{name="boolean", creturned=true}})

- for _, name in ipairs({"cmul", "cpow", "cdiv"}) do
+ for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -906,20 +906,27 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=Tensor},
{name="index", default=lastdim(2)}},
{name="index", default=-1}},
cname("catArray"),
{{name=Tensor, default=true, returned=true},
{name=Tensor .. "Array"},
{name="index", default=lastdimarray(2)}})
{name="index", default=-1}})

- for _,f in ipairs({{name='geometric'},
- {name='bernoulli', a=0.5}}) do
+ wrap("geometric",
+ cname("geometric"),
+ {{name=Tensor, returned=true},
+ {name='double'}})

- wrap(f.name,
- cname(f.name),
- {{name=Tensor, returned=true},
- {name='double', default=f.a}})
- end
+ wrap("bernoulli",
+ cname("bernoulli"),
+ {{name=Tensor, returned=true},
+ {name='double', default=0.5}},
+ cname("bernoulli_FloatTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaTensor"}},
+ cname("bernoulli_DoubleTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaDoubleTensor"}})

wrap("nonzero",
cname("nonzero"),
@@ -964,7 +971,7 @@

wrap("multinomial",
cname("multinomial"),
- {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {{name='CudaLongTensor', default=true, returned=true, method={default='nil'}},
{name=Tensor},
{name="int"},
{name="boolean", default=false}})
@@ -1450,7 +1457,7 @@ wrap("equal",
{name=Tensor},
{name="boolean", creturned=true}})

- for _, name in ipairs({"cmul", "cpow", "cdiv"}) do
+ for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1820,7 +1827,7 @@ wrap("randn",

wrap("multinomial",
cname("multinomial"),
- {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {{name='CudaLongTensor', default=true, returned=true, method={default='nil'}},
{name=Tensor},
{name="int"},
{name="boolean", default=false}})
@@ -1857,25 +1864,32 @@ wrap("cat",
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=Tensor},
{name="index", default=lastdim(2)}},
{name="index", default=-1}},
cname("catArray"),
{{name=Tensor, default=true, returned=true},
{name=Tensor .. "Array"},
{name="index", default=lastdimarray(2)}})
{name="index", default=-1}})

wrap("nonzero",
cname("nonzero"),
{{name="CudaLongTensor", default=true, returned=true},
{name=Tensor}})

- for _,f in ipairs({{name='geometric'},
- {name='bernoulli', a=0.5}}) do

- wrap(f.name,
- cname(f.name),
- {{name=Tensor, returned=true},
- {name=real, default=f.a}})
- end
+ wrap("geometric",
+ cname("geometric"),
+ {{name=Tensor, returned=true},
+ {name='double'}})

+ wrap("bernoulli",
+ cname("bernoulli"),
+ {{name=Tensor, returned=true},
+ {name='double', default=0.5}},
+ cname("bernoulli_FloatTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaTensor"}},
+ cname("bernoulli_DoubleTensor"),
+ {{name=Tensor, returned=true},
+ {name="CudaDoubleTensor"}})

for _,f in ipairs({{name='uniform', a=0, b=1},
{name='normal', a=0, b=1},
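
In terms of the generated Lua API, the changes in this file correspond roughly to the following usage (a sketch; shapes and values are arbitrary, and exact behaviour depends on the generated bindings):

```lua
require 'cutorch'

-- cat: with no dimension given, the new default index of -1 concatenates along the last dimension.
local a = torch.CudaTensor(2, 3):fill(1)
local b = torch.CudaTensor(2, 3):fill(2)
print(torch.cat(a, b):size())            -- 2x6

-- bernoulli: besides a scalar probability, a per-element probability tensor is now accepted.
local r = torch.CudaTensor(2, 3)
local p = torch.CudaTensor(2, 3):uniform()
r:bernoulli(0.25)                        -- scalar probability
r:bernoulli(p)                           -- probabilities taken element-wise from a CudaTensor

-- multinomial: sampled indices now come back as a torch.CudaLongTensor.
local probs = torch.CudaTensor(10):uniform()
local idx = torch.multinomial(probs, 3, false)
print(torch.type(idx))                   -- torch.CudaLongTensor

-- cremainder / cfmod: element-wise remainder against another tensor.
local m = torch.CudaTensor(2, 3):fill(7)
local n = torch.CudaTensor(2, 3):fill(3)
print(m:cremainder(n))                   -- every element becomes 7 % 3 = 1
```
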
69 changes: 16 additions & 53 deletions generic/CStorage.c
@@ -2,12 +2,23 @@
#define THC_GENERIC_FILE "generic/CStorage.c"
#else

#include "THHalf.h"

/* everything is as the generic Storage.c, except few things (see below) */

+ // FixMe: Requires an unsafe conversion in that we convert from cutorch's 'half'
+ // to torch's THHalf. These types are required to be defined in the same way
+ // (is there some way to enforce this?)
+ #ifdef THC_REAL_IS_HALF
+ #define THFILE_REAL_CAST(x) (THHalf *)x
+ #else
+ #define THFILE_REAL_CAST(x) x
+ #endif

#define THFile_readRealRaw(file, data, size) \
{ \
real *fdata = (real*)THAlloc(sizeof(real)*size); \
- TH_CONCAT_3(THFile_read,Real,Raw)(file, fdata, size); \
+ TH_CONCAT_3(THFile_read,Real,Raw)(file, THFILE_REAL_CAST(fdata), size); \
THCudaCheck(cudaMemcpy(data, fdata, size * sizeof(real), cudaMemcpyHostToDevice)); \
THFree(fdata); \
}
@@ -16,14 +27,15 @@
{ \
real *fdata = (real*)THAlloc(sizeof(real)*size); \
THCudaCheck(cudaMemcpy(fdata, data, size * sizeof(real), cudaMemcpyDeviceToHost)); \
- TH_CONCAT_3(THFile_write,Real,Raw)(file, fdata, size); \
+ TH_CONCAT_3(THFile_write,Real,Raw)(file, THFILE_REAL_CAST(fdata), size); \
THFree(fdata); \
}

#define TH_GENERIC_FILE "generic/Storage.c"
#include "generic/Storage.c"

#undef TH_GENERIC_FILE
+ #undef THFILE_REAL_CAST
#undef THFile_readRealRaw
#undef THFile_writeRealRaw

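The read/write macros above stage CUDA data through a temporary host buffer around ordinary `THFile` I/O. A standalone sketch of that host-staging idea (plain CUDA C, error handling omitted; not the actual THC code):

```c
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Write a device buffer to a file by staging through host memory,
 * mirroring what THFile_writeRealRaw does above. */
void write_device_floats(FILE *f, const float *d_data, size_t n)
{
    float *h_tmp = (float *)malloc(n * sizeof(float));
    cudaMemcpy(h_tmp, d_data, n * sizeof(float), cudaMemcpyDeviceToHost); /* synchronous copy */
    fwrite(h_tmp, sizeof(float), n, f);                                   /* ordinary host I/O */
    free(h_tmp);
}

/* Read from a file into a device buffer, mirroring THFile_readRealRaw. */
void read_device_floats(FILE *f, float *d_data, size_t n)
{
    float *h_tmp = (float *)malloc(n * sizeof(float));
    size_t got = fread(h_tmp, sizeof(float), n, f);
    cudaMemcpy(d_data, h_tmp, got * sizeof(float), cudaMemcpyHostToDevice);
    free(h_tmp);
}
```
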
@@ -76,51 +88,6 @@ static int cutorch_Storage_(copy)(lua_State *L)
return 1;
}

- static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L)
- {
- THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage));
- void *src;
- if( (src = luaT_toudata(L, 2, TH_CONCAT_STRING_3(torch.,Real,Storage) )))
- THStorage_(copy)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
- THStorage_(copyByte)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
- THStorage_(copyChar)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
- THStorage_(copyShort)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
- THStorage_(copyInt)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
- THStorage_(copyLong)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
- THStorage_(copyFloat)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
- THStorage_(copyDouble)(storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
- THStorage_(copyCudaFloat)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
- THStorage_(copyCudaLong)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
- THStorage_(copyCudaByte)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
- THStorage_(copyCudaChar)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
- THStorage_(copyCudaShort)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
- THStorage_(copyCudaInt)(cutorch_getstate(L), storage, src);
- else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
- THStorage_(copyCudaDouble)(cutorch_getstate(L), storage, src);
- #ifdef CUDA_HALF_TENSOR
- else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
- THStorage_(copyCudaHalf)(cutorch_getstate(L), storage, src);
- #endif
- else
- luaL_typerror(L, 2, "torch.*Storage");
-
- lua_settop(L, 1);
- return 1;
- }

static int cutorch_Storage_(getDevice)(lua_State *L) {
THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
lua_pushinteger(L, THCStorage_(getDevice)(cutorch_getstate(L), storage) + 1);
@@ -132,12 +99,8 @@ void cutorch_Storage_(init)(lua_State* L)
/* the standard stuff */
torch_Storage_(init)(L);

- // torch_Storage macro is defined in Storage.c produce the CudaTensor types
- // so I have to construct the normal torch types by hand
- luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage));
- lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy));
- lua_setfield(L, -2, "copy");
- lua_pop(L, 1);
+ // Register this even though it is generated elsewhere.
+ cutorch_StorageCopy_(init)(L);

luaT_pushmetatable(L, torch_Storage);
lua_pushcfunction(L, cutorch_Storage_(copy));
64 changes: 64 additions & 0 deletions generic/CStorageCopy.c
@@ -0,0 +1,64 @@
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/CStorageCopy.c"
#else

#include "THHalf.h"

static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L)
{
THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage));
void *src;
if( (src = luaT_toudata(L, 2, TH_CONCAT_STRING_3(torch.,Real,Storage) )))
THStorage_(copy)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
THStorage_(copyByte)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
THStorage_(copyChar)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
THStorage_(copyShort)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
THStorage_(copyInt)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
THStorage_(copyLong)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
THStorage_(copyFloat)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
THStorage_(copyDouble)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
THStorage_(copyHalf)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
THStorage_(copyCudaFloat)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
THStorage_(copyCudaLong)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
THStorage_(copyCudaByte)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
THStorage_(copyCudaChar)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
THStorage_(copyCudaShort)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
THStorage_(copyCudaInt)(cutorch_getstate(L), storage, src);
else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
THStorage_(copyCudaDouble)(cutorch_getstate(L), storage, src);
#ifdef CUDA_HALF_TENSOR
else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
THStorage_(copyCudaHalf)(cutorch_getstate(L), storage, src);
#endif
else
luaL_typerror(L, 2, "torch.*Storage");

lua_settop(L, 1);
return 1;
}

void cutorch_StorageCopy_(init)(lua_State* L)
{
// torch_Storage macro is defined in Storage.c produce the CudaTensor types
// so I have to construct the normal torch types by hand
luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage));
lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy));
lua_setfield(L, -2, "copy");
lua_pop(L, 1);
}

#endif
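
Once `cutorch_StorageCopy_(init)` is called from `CStorage.c` (see the init changes above), the practical effect is that a CPU storage's `copy` accepts CUDA storages as well. A usage sketch (values are arbitrary):

```lua
require 'cutorch'

local gpu = torch.range(1, 4):cuda():storage()   -- a torch.CudaStorage holding {1, 2, 3, 4}
local cpu = torch.FloatStorage(4)
cpu:copy(gpu)                                    -- dispatches to the copyCudaFloat branch above
print(cpu[4])                                    -- 4
```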

