[lua-torch-cutorch] 01/03: New upstream version 0~20170511-g92e9c08
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Mon May 22 04:27:46 UTC 2017
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-cutorch.
commit a035f60e90e1740d09a34092194e7c65183f557c
Author: Zhou Mo <cdluminate at gmail.com>
Date: Mon May 22 04:27:07 2017 +0000
New upstream version 0~20170511-g92e9c08
---
CMakeLists.txt | 14 +
README.md | 1 +
TensorMath.lua | 144 +++++-
init.c | 27 ++
init.lua | 30 +-
lib/THC/CMakeLists.txt | 71 ++-
lib/THC/THC.h | 1 -
lib/THC/THCAsmUtils.cuh | 50 +-
lib/THC/THCAtomics.cuh | 1 +
lib/THC/THCBlas.cu | 33 ++
lib/THC/THCBlas.h | 4 +
lib/THC/THCCachingAllocator.cpp | 159 +++++-
lib/THC/THCCachingAllocator.h | 10 +
lib/THC/THCCachingHostAllocator.cpp | 84 +++-
lib/THC/THCCachingHostAllocator.h | 3 +-
lib/THC/THCGeneral.c | 231 ++++++++-
lib/THC/THCGeneral.h.in | 32 +-
lib/THC/THCHalf.cu | 90 ----
lib/THC/THCNumerics.cuh | 30 +-
lib/THC/THCReduce.cuh | 7 +-
lib/THC/THCReduceAll.cuh | 2 +-
lib/THC/THCReduceApplyUtils.cuh | 114 +++--
lib/THC/THCScanUtils.cuh | 122 ++++-
lib/THC/THCSortUtils.cu | 17 +
lib/THC/THCSortUtils.cuh | 70 ++-
lib/THC/THCStream.c | 30 --
lib/THC/THCStream.cpp | 60 +++
lib/THC/THCStream.h | 2 +
lib/THC/THCTensorConv.cu | 8 +-
lib/THC/THCTensorCopy.h | 1 +
lib/THC/THCTensorMath.cu | 26 +
lib/THC/THCTensorMath.h | 6 +
lib/THC/THCTensorMath2.cu | 2 +-
lib/THC/THCTensorMathPairwise.cu | 80 ++-
lib/THC/THCTensorMathPointwise.cuh | 129 ++++-
lib/THC/THCTensorMathReduce.cu | 4 +-
lib/THC/THCTensorMathReduce.cuh | 11 +-
lib/THC/THCTensorMathScan.cu | 10 +-
lib/THC/THCTensorMode.cu | 16 +
lib/THC/THCTensorMode.cuh | 282 +++++++++++
lib/THC/THCTensorRandom.cuh | 65 ++-
lib/THC/THCTensorScatterGather.cu | 9 +-
lib/THC/THCTensorSort.cu | 16 -
lib/THC/THCTensorSort.cuh | 1 -
lib/THC/THCTensorTopK.cu | 524 +-------------------
lib/THC/THCTensorTopK.cuh | 485 +++++++++++++++++++
lib/THC/THCTensorTopK.h | 14 -
lib/THC/THCTensorTypeUtils.cu | 8 +
lib/THC/THCTensorTypeUtils.cuh | 2 +
lib/THC/generic/THCTensor.c | 137 ++++--
lib/THC/generic/THCTensor.cu | 2 +-
lib/THC/generic/THCTensor.h | 6 +-
lib/THC/generic/THCTensorCopy.c | 8 +-
lib/THC/generic/THCTensorIndex.cu | 22 +-
lib/THC/generic/THCTensorMasked.cu | 12 +-
lib/THC/generic/THCTensorMath.cu | 140 ++++--
lib/THC/generic/THCTensorMath.h | 8 +
lib/THC/generic/THCTensorMathBlas.cu | 244 +++++++++-
lib/THC/generic/THCTensorMathBlas.h | 3 +
lib/THC/generic/THCTensorMathCompare.cu | 24 +-
lib/THC/generic/THCTensorMathCompareT.cu | 24 +-
lib/THC/generic/THCTensorMathMagma.cu | 67 ++-
lib/THC/generic/THCTensorMathPairwise.cu | 132 ++++-
lib/THC/generic/THCTensorMathPairwise.h | 5 +
lib/THC/generic/THCTensorMathPointwise.cu | 182 ++++++-
lib/THC/generic/THCTensorMathPointwise.h | 6 +
lib/THC/generic/THCTensorMathReduce.cu | 87 ++--
lib/THC/generic/THCTensorMathReduce.h | 16 +-
lib/THC/generic/THCTensorMathScan.cu | 39 +-
lib/THC/generic/THCTensorMode.cu | 315 ++++++++++++
lib/THC/generic/THCTensorMode.h | 14 +
lib/THC/generic/THCTensorRandom.cu | 27 +-
lib/THC/generic/THCTensorScatterGather.cu | 12 +-
lib/THC/generic/THCTensorSort.cu | 4 +-
lib/THC/generic/THCTensorTopK.cu | 159 ++++++
lib/THC/generic/THCTensorTopK.h | 13 +
rocks/cutorch-1.0-0.rockspec | 5 +-
rocks/cutorch-scm-1.rockspec | 4 +-
test/test.lua | 776 ++++++++++++++++++++++++++++--
torch/utils.h | 2 +-
80 files changed, 4403 insertions(+), 1230 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d1d0a0..8d3ece7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,20 @@ INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/torch")
SET(src Storage.c init.c Tensor.c TensorMath.c TensorOperator.c torch/utils.c)
SET(luasrc init.lua Tensor.lua FFI.lua test/test.lua)
+set(CMAKE_REQUIRED_INCLUDES ${LUA_INCDIR})
+include(CheckCSourceCompiles)
+check_c_source_compiles("
+#include <lauxlib.h>
+int main()
+{
+ long i = sizeof(&luaL_setfuncs);
+ return 0;
+}
+" HAS_LUAL_SETFUNCS)
+if(HAS_LUAL_SETFUNCS)
+ add_definitions(-DHAS_LUAL_SETFUNCS)
+endif()
+
ADD_TORCH_WRAP(cudatensormathwrap TensorMath.lua)
ADD_TORCH_PACKAGE(cutorch "${src}" "${luasrc}")
diff --git a/README.md b/README.md
index 3b4a174..263a131 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ With the caching memory allocator, device allocations and frees should logically
- `cutorch.getState()` - Returns the global state of the cutorch package. This state is not for users, it stores the raw RNG states, cublas handles and other thread and device-specific stuff.
- `cutorch.withDevice(devID, f)` - This is a convenience for multi-GPU code, that takes in a device ID as well as a function f. It switches cutorch to the new device, executes the function f, and switches back cutorch to the original device.
- `cutorch.createCudaHostTensor([...])` - Allocates a `torch.FloatTensor` of [host-pinned memory](https://devblogs.nvidia.com/parallelforall/how-optimize-data-transfers-cuda-cc/), where dimensions can be given as an argument list of sizes or a `torch.LongStorage`.
+- `cutorch.isCachingAllocatorEnabled()` - Returns whether the caching CUDA memory allocator is enabled or not.
#### Low-level streams functions (dont use this as a user, easy to shoot yourself in the foot):
- `cutorch.reserveStreams(n [, nonblocking])`: creates n user streams for use on every device. NOTE: stream index `s` on device 1 is a different cudaStream_t than stream `s` on device 2. Takes an optional non-blocking flag; by default, this is assumed to be false. If true, then the stream is created with cudaStreamNonBlocking.
diff --git a/TensorMath.lua b/TensorMath.lua
index 936d897..0971de0 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -661,6 +661,18 @@ for k, Tensor_ in pairs(handledTypenames) do
{name=Tensor, method={default=1}},
{name=real}})
+ wrap("lshift",
+ cname("lshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("rshift",
+ cname("rshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
wrap("fmod",
cname("fmod"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -673,13 +685,33 @@ for k, Tensor_ in pairs(handledTypenames) do
{name=Tensor, method={default=1}},
{name=real}})
+ wrap("bitand",
+ cname("bitand"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("bitor",
+ cname("bitor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("bitxor",
+ cname("bitxor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
wrap("equal",
cname("equal"),
{{name=Tensor},
{name=Tensor},
{name="boolean", creturned=true}})
- for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
+ local cfuncs = {"cmul", "cpow", "cdiv", "cremainder", "cfmod",
+ "clshift", "crshift", "cbitand", "cbitor", "cbitxor"}
+ for _, name in ipairs(cfuncs) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -712,7 +744,8 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name='CudaLongTensor', default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
end
for _,name in ipairs({"cmin", "cmax"}) do
@@ -763,7 +796,8 @@ for k, Tensor_ in pairs(handledTypenames) do
cname("sum"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _, name in ipairs({"cumsum", "cumprod"}) do
wrap(name,
@@ -780,7 +814,8 @@ for k, Tensor_ in pairs(handledTypenames) do
cname("prod"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("mean",
cname("meanall"),
@@ -789,7 +824,8 @@ for k, Tensor_ in pairs(handledTypenames) do
cname("mean"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("maskedFill",
cname("maskedFill"),
@@ -853,6 +889,24 @@ for k, Tensor_ in pairs(handledTypenames) do
{name="boolean", default=0}}
)
+ wrap("topk",
+ cname("topk"),
+ {{name=Tensor, default=true, returned=true},
+ {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+ {name=Tensor},
+ {name="long", default=1},
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=0},
+ {name="boolean", default=0}})
+
+ wrap("mode",
+ cname("mode"),
+ {{name=Tensor, default=true, returned=true, noreadadd=true},
+ {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+ {name=Tensor},
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=true, invisible=true}})
+
wrap("squeeze",
cname("squeeze"),
{{name=Tensor, default=true, returned=true, postcall=function(arg)
@@ -933,6 +987,13 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name="CudaLongTensor", default=true, returned=true},
{name=Tensor}})
+ wrap("range",
+ cname("range"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=accreal},
+ {name=accreal},
+ {name=accreal, default=1}})
+
if real == 'float' or real == 'double' or real == 'half' then
for _,name in ipairs({"log", "log1p", "exp",
"cos", "acos", "cosh",
@@ -949,6 +1010,20 @@ for k, Tensor_ in pairs(handledTypenames) do
end
+ wrap("linspace",
+ cname("linspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
+ wrap("logspace",
+ cname("logspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
wrap("pow",
cname("pow"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1002,7 +1077,8 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=real},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("renorm",
cname("renorm"),
@@ -1029,7 +1105,8 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name="index"},
- {name="boolean", default=false}})
+ {name="boolean", default=false},
+ {name="boolean", default=true, invisible=true}})
end
wrap("tril",
@@ -1055,8 +1132,6 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor},
{name=accreal, creturned=true}})
-
-
wrap("lerp",
cname("lerp"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1393,6 +1468,20 @@ wrap("zeros",
{{name=Tensor, default=true, returned=true, method={default='nil'}},
{name="LongArg"}})
+wrap("linspace",
+ cname("linspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
+wrap("logspace",
+ cname("logspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
wrap("reshape",
cname("reshape"),
{{name=Tensor, default=true, returned=true},
@@ -1457,7 +1546,9 @@ wrap("equal",
{name=Tensor},
{name="boolean", creturned=true}})
-for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
+local cfuncs = {"cmul", "cpow", "cdiv", "cremainder", "cfmod",
+ "clshift", "crshift", "cbitand", "cbitor", "cbitxor"}
+for _, name in ipairs(cfuncs) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1552,6 +1643,14 @@ wrap("topk",
{name="boolean", default=0},
{name="boolean", default=0}})
+wrap("mode",
+ cname("mode"),
+ {{name=Tensor, default=true, returned=true, noreadadd=true},
+ {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+ {name=Tensor},
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=true, invisible=true}})
+
do
local Tensor = Tensor
local real = real
@@ -1701,7 +1800,8 @@ wrap("sum",
cname("sum"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _, name in ipairs({"cumsum", "cumprod"}) do
wrap(name,
@@ -1718,7 +1818,8 @@ wrap("prod",
cname("prod"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _,name in ipairs({"min", "max"}) do
wrap(name,
@@ -1729,7 +1830,8 @@ for _,name in ipairs({"min", "max"}) do
{{name=Tensor, default=true, returned=true},
{name='CudaLongTensor', default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
end
for _,name in ipairs({"cmin", "cmax"}) do
@@ -1875,6 +1977,13 @@ wrap("nonzero",
{{name="CudaLongTensor", default=true, returned=true},
{name=Tensor}})
+wrap("range",
+ cname("range"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name=real, default=1}})
+
wrap("geometric",
cname("geometric"),
{{name=Tensor, returned=true},
@@ -2022,7 +2131,8 @@ wrap("mean",
cname("mean"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _,name in ipairs({"var", "std"}) do
wrap(name,
@@ -2033,7 +2143,8 @@ for _,name in ipairs({"var", "std"}) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name="index"},
- {name="boolean", default=false}})
+ {name="boolean", default=false},
+ {name="boolean", default=true, invisible=true}})
end
wrap("norm",
@@ -2045,7 +2156,8 @@ wrap("norm",
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=real},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("renorm",
cname("renorm"),
diff --git a/init.c b/init.c
index 894be2e..8b32a1a 100644
--- a/init.c
+++ b/init.c
@@ -699,6 +699,14 @@ static int cutorch_setKernelPeerToPeerAccess(lua_State *L)
return 0;
}
+static int cutorch_isCachingAllocatorEnabled(lua_State *L)
+{
+ THCState *state = cutorch_getstate(L);
+ lua_pushboolean(L, THCState_isCachingAllocatorEnabled(state));
+
+ return 1;
+}
+
static int cutorch_getMemoryUsage(lua_State *L) {
size_t freeBytes = 0;
size_t totalBytes = 0;
@@ -780,6 +788,22 @@ static int cutorch_getDeviceProperties(lua_State *L)
return 1;
}
+static int cutorch_getRuntimeVersion(lua_State *L)
+{
+ int version;
+ THCudaCheck(cudaRuntimeGetVersion(&version));
+ lua_pushnumber(L, version);
+ return 1;
+}
+
+static int cutorch_getDriverVersion(lua_State *L)
+{
+ int version;
+ THCudaCheck(cudaDriverGetVersion(&version));
+ lua_pushnumber(L, version);
+ return 1;
+}
+
static int cutorch_seed(lua_State *L)
{
unsigned long long seed = THCRandom_seed(cutorch_getstate(L));
@@ -977,7 +1001,10 @@ static const struct luaL_Reg cutorch_stuff__ [] = {
{"setPeerToPeerAccess", cutorch_setPeerToPeerAccess},
{"setKernelPeerToPeerAccess", cutorch_setKernelPeerToPeerAccess},
{"getKernelPeerToPeerAccess", cutorch_getKernelPeerToPeerAccess},
+ {"isCachingAllocatorEnabled", cutorch_isCachingAllocatorEnabled},
{"getDeviceProperties", cutorch_getDeviceProperties},
+ {"getRuntimeVersion", cutorch_getRuntimeVersion},
+ {"getDriverVersion", cutorch_getDriverVersion},
{"getMemoryUsage", cutorch_getMemoryUsage},
{"hasHalfInstructions", cutorch_hasHalfInstructions},
{"hasFastHalfInstructions", cutorch_hasFastHalfInstructions},
diff --git a/init.lua b/init.lua
index fdb7b08..59665c3 100644
--- a/init.lua
+++ b/init.lua
@@ -49,27 +49,23 @@ local function longTensorSize(...)
return size
end
--- Creates a FloatTensor using the CudaHostAllocator.
--- Accepts either a LongStorage or a sequence of numbers.
-function cutorch.createCudaHostTensor(...)
- local size = longTensorSize(...)
- local storage = torch.FloatStorage(cutorch.CudaHostAllocator, size:prod())
- return torch.FloatTensor(storage, 1, size:storage())
+local hostTypes = {'Float', 'Double', 'Int', 'Long', 'Byte'}
+if cutorch.hasHalf then
+ table.insert(hostTypes, 'Half')
end
-function cutorch.createCudaHostDoubleTensor(...)
- local size = longTensorSize(...)
- local storage = torch.DoubleStorage(cutorch.CudaHostAllocator, size:prod())
- return torch.DoubleTensor(storage, 1, size:storage())
+for _, ty in ipairs(hostTypes) do
+ -- Creates torch Tensors using the CudaHostAllocator.
+ -- Accepts either a LongStorage or a sequence of numbers.
+ cutorch['createCudaHost' .. ty .. 'Tensor'] = function(...)
+ local size = longTensorSize(...)
+ local storage = torch[ty .. 'Storage'](cutorch.CudaHostAllocator, size:prod())
+ return torch[ty .. 'Tensor'](storage, 1, size:storage())
+ end
end
-if cutorch.hasHalf then
- function cutorch.createCudaHostHalfTensor(...)
- local size = longTensorSize(...)
- local storage = torch.HalfStorage(cutorch.CudaHostAllocator, size:prod())
- return torch.HalfTensor(storage, 1, size:storage())
- end
- end
+-- Alias to automate creation from both torch and cutorch types
+cutorch.createCudaHostTensor = cutorch.createCudaHostFloatTensor
-- Creates a CudaTensor using the CudaUVAAllocator.
-- Accepts either a LongStorage or a sequence of numbers.
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index 0e08120..1ea6039 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -3,6 +3,7 @@ CMAKE_POLICY(VERSION 2.8)
SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
OPTION(NDEBUG "disable asserts (WARNING: this may result in invalid memory accesses)")
IF(NOT NDEBUG)
MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
@@ -50,6 +51,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
THCTensorRandom.cpp
THCCachingAllocator.cpp
THCCachingHostAllocator.cpp
+ THCStream.cpp
PROPERTIES COMPILE_FLAGS -std=${CXX_VERSION})
ELSE()
SET(CMAKE_CXX_STANDARD 11)
@@ -59,6 +61,10 @@ ENDIF()
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
+IF ("$ENV{STATIC_TH}" STREQUAL "YES")
+LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+ENDIF()
+
IF(MAGMA_FOUND)
INCLUDE_DIRECTORIES(${MAGMA_INCLUDE_DIR})
SET(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}")
@@ -130,9 +136,9 @@ IF(NOT THC_INSTALL_BIN_SUBDIR
SET(THC_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR})
ELSE(Torch_INSTALL_BIN_SUBDIR)
# not installing in a Torch context, so Torch_INSTALL_BIN_SUBDIR is not available
- SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")
- SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")
- SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")
+ SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")
+ SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")
+ SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")
SET(THC_INSTALL_CMAKE_SUBDIR "share/cmake/THC" CACHE PATH "THC install cmake subdirectory")
ENDIF(Torch_INSTALL_BIN_SUBDIR)
@@ -153,7 +159,7 @@ SET(src
THCCachingHostAllocator.cpp
THCGeneral.c
THCStorageCopy.c
- THCStream.c
+ THCStream.cpp
THCTensor.c
THCTensorCopy.c
THCTensorRandom.cpp
@@ -182,6 +188,8 @@ SET(src-cuda
THCTensorTopK.cu
THCTensorSort.cu
THCTensorTypeUtils.cu
+ THCSortUtils.cu
+ THCTensorMode.cu
)
# loop over all types
@@ -208,28 +216,33 @@ ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+IF ("$ENV{STATIC_TH}" STREQUAL "YES")
+ CUDA_ADD_LIBRARY(THC STATIC ${src} ${src-cuda})
+ SET_TARGET_PROPERTIES(THC PROPERTIES COMPILE_FLAGS "-fPIC")
+ELSE()
+ CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
+ CUDA_ADD_CUBLAS_TO_TARGET(THC)
+ TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY})
+
+ IF(USE_MAGMA)
+ TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES})
+ ENDIF(USE_MAGMA)
+
+ IF(NOT THC_SO_VERSION)
+ SET(THC_SO_VERSION 0)
+ ENDIF(NOT THC_SO_VERSION)
+ MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
+ SET_TARGET_PROPERTIES(THC PROPERTIES
+ VERSION ${THC_SO_VERSION}
+ SOVERSION ${THC_SO_VERSION})
+
+
+ INSTALL(TARGETS THC
+ RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
+ LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
+ ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
+ENDIF()
-CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
-CUDA_ADD_CUBLAS_TO_TARGET(THC)
-TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY})
-
-IF(USE_MAGMA)
- TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES} ${CUDA_cusparse_LIBRARY})
-ENDIF(USE_MAGMA)
-
-IF(NOT THC_SO_VERSION)
- SET(THC_SO_VERSION 0)
-ENDIF(NOT THC_SO_VERSION)
-MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
-SET_TARGET_PROPERTIES(THC PROPERTIES
- VERSION ${THC_SO_VERSION}
- SOVERSION ${THC_SO_VERSION})
-
-
-INSTALL(TARGETS THC
- RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
- LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
- ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
INSTALL(FILES
THC.h
@@ -245,7 +258,6 @@ INSTALL(FILES
THCTensorRandom.h
THCTensorMath.h
THCTensorConv.h
- THCTensorTopK.h
THCApply.cuh
THCReduce.cuh
THCReduceAll.cuh
@@ -276,10 +288,13 @@ INSTALL(FILES
THCNumerics.cuh
THCTensorSort.cuh
THCTensorInfo.cuh
+ THCTensorMathPointwise.cuh
THCTensorTypeUtils.cuh
THCTensorRandom.cuh
THCTensorMathMagma.cuh
THCThrustAllocator.cuh
+ THCTensorMode.cuh
+ THCTensorTopK.cuh
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
INSTALL(FILES
@@ -324,4 +339,8 @@ INSTALL(FILES
generic/THCDeviceTensorUtils.cu
generic/THCTensorRandom.h
generic/THCTensorRandom.cu
+ generic/THCTensorMode.h
+ generic/THCTensorMode.cu
+ generic/THCTensorTopK.h
+ generic/THCTensorTopK.cu
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/lib/THC/THC.h b/lib/THC/THC.h
index e3840dc..90a3a53 100644
--- a/lib/THC/THC.h
+++ b/lib/THC/THC.h
@@ -15,6 +15,5 @@
#include "THCTensorRandom.h"
#include "THCTensorMath.h"
#include "THCTensorConv.h"
-#include "THCTensorTopK.h"
#endif
diff --git a/lib/THC/THCAsmUtils.cuh b/lib/THC/THCAsmUtils.cuh
index 7015d20..f0dc90b 100644
--- a/lib/THC/THCAsmUtils.cuh
+++ b/lib/THC/THCAsmUtils.cuh
@@ -3,20 +3,44 @@
// Collection of direct PTX functions
-__device__ __forceinline__
-unsigned int getBitfield(unsigned int val, int pos, int len) {
- unsigned int ret;
- asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
- return ret;
-}
+template <typename T>
+struct Bitfield {};
-__device__ __forceinline__
-unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
- unsigned int ret;
- asm("bfi.b32 %0, %1, %2, %3, %4;" :
- "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
- return ret;
-}
+template <>
+struct Bitfield<unsigned int> {
+ static __device__ __forceinline__
+ unsigned int getBitfield(unsigned int val, int pos, int len) {
+ unsigned int ret;
+ asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+
+ static __device__ __forceinline__
+ unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
+ unsigned int ret;
+ asm("bfi.b32 %0, %1, %2, %3, %4;" :
+ "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+};
+
+template <>
+struct Bitfield<unsigned long long int> {
+ static __device__ __forceinline__
+ unsigned long long int getBitfield(unsigned long long int val, int pos, int len) {
+ unsigned long long int ret;
+ asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+
+ static __device__ __forceinline__
+ unsigned long long int setBitfield(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
+ unsigned long long int ret;
+ asm("bfi.b64 %0, %1, %2, %3, %4;" :
+ "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+};
__device__ __forceinline__ int getLaneId() {
int laneId;
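The free getBitfield/setBitfield functions become a Bitfield<T> template so that callers keying on 64-bit values (presumably the generic topk/sort paths added later in this patch) can use the bfe.u64/bfi.b64 forms alongside the existing 32-bit ones. A small device-side sketch using the 64-bit specialization; the kernel itself is illustrative and not part of the patch:

    #include "THCAsmUtils.cuh"

    /* extract a radix digit from each 64-bit key */
    __global__ void extractDigit(const unsigned long long *keys,
                                 unsigned int *digits, int n, int pos, int len)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        digits[i] = (unsigned int)
            Bitfield<unsigned long long int>::getBitfield(keys[i], pos, len);
      }
    }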
diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh
index ac0b45f..7a0be48 100644
--- a/lib/THC/THCAtomics.cuh
+++ b/lib/THC/THCAtomics.cuh
@@ -2,6 +2,7 @@
#define THC_ATOMICS_INC
#include "THCHalf.h"
+#include "THCNumerics.cuh"
template <typename T, size_t n>
struct AtomicAddIntegerImpl;
diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu
index c438ad8..9db4f0b 100644
--- a/lib/THC/THCBlas.cu
+++ b/lib/THC/THCBlas.cu
@@ -389,6 +389,39 @@ void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot,
THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize));
}
+THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize)
+{
+ if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) )
+ {
+ THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize"
+ "with the bound [val] <= %d", INT_MAX);
+ }
+
+ // no need to adjust leading dimensions, since matrices are square
+ cublasOperation_t opa = convertTransToCublasOperation(transa);
+
+ cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+ cublasSetStream(handle, THCState_getCurrentStream(state));
+ THCublasCheck(cublasSgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize));
+}
+
+
+THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize)
+{
+ if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) )
+ {
+ THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize"
+ "with the bound [val] <= %d", INT_MAX);
+ }
+
+ // no need to adjust leading dimensions, since matrices are square
+ cublasOperation_t opa = convertTransToCublasOperation(transa);
+
+ cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+ cublasSetStream(handle, THCState_getCurrentStream(state));
+ THCublasCheck(cublasDgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize));
+}
+
void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) {
if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) )
diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h
index bf91f93..25246b1 100644
--- a/lib/THC/THCBlas.h
+++ b/lib/THC/THCBlas.h
@@ -35,6 +35,10 @@ THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb,
/* Inverse */
THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize);
THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize);
+
+THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize);
+THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize);
+
THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize);
THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize);
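The new Sgetrs/Dgetrs entry points pair with the existing batched LU factorizations, presumably backing the batched-solve additions in generic/THCTensorMathBlas.cu. A hedged sketch of driving them for a single 2x2 system, following the cuBLAS batched convention that a/b are device arrays of device pointers, getrf's pivot/info buffers live on the device, and getrs reports through a host int; error checking and result readback are omitted to keep it short:

    /* illustrative only; `state` is an initialized THCState* */
    #include "THC.h"

    static void solve2x2(THCState *state)
    {
      const int n = 2, nrhs = 1, batch = 1;
      float hA[4] = {4.f, 1.f, 2.f, 3.f};           /* column-major 2x2 */
      float hB[2] = {1.f, 2.f};
      float *dA, *dB, **dAarray, **dBarray;
      int *dPivot, *dInfo, hInfo = 0;

      cudaMalloc(&dA, sizeof(hA));  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
      cudaMalloc(&dB, sizeof(hB));  cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);
      cudaMalloc(&dAarray, sizeof(float *));
      cudaMemcpy(dAarray, &dA, sizeof(float *), cudaMemcpyHostToDevice);
      cudaMalloc(&dBarray, sizeof(float *));
      cudaMemcpy(dBarray, &dB, sizeof(float *), cudaMemcpyHostToDevice);
      cudaMalloc(&dPivot, n * sizeof(int));
      cudaMalloc(&dInfo, sizeof(int));

      THCudaBlas_Sgetrf(state, n, dAarray, n, dPivot, dInfo, batch);    /* LU factorize in place */
      THCudaBlas_Sgetrs(state, 'n', n, nrhs, (const float **)dAarray, n,
                        dPivot, dBarray, n, &hInfo, batch);             /* dB now holds x */

      cudaFree(dInfo);   cudaFree(dPivot);
      cudaFree(dBarray); cudaFree(dAarray);
      cudaFree(dB);      cudaFree(dA);
    }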
diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp
index eeae04a..11d1467 100644
--- a/lib/THC/THCCachingAllocator.cpp
+++ b/lib/THC/THCCachingAllocator.cpp
@@ -1,6 +1,7 @@
#include "THCCachingAllocator.h"
#include <cuda_runtime_api.h>
+#include <deque>
#include <map>
#include <memory>
#include <mutex>
@@ -17,7 +18,7 @@
// split. If no block is found, the allocator will delegate to cudaMalloc.
// - If the cudaMalloc fails, the allocator will free all cached blocks that
// are not split and retry the allocation.
-// - Large (>1MB) and small allocation requestss are handled separately. Large
+// - Large (>1MB) and small allocation requests are handled separately. Large
// allocation requests can be filled by a cudaMalloc call of the exact size.
// Small requests will allocate and split a 1MB buffer, if necessary.
//
@@ -26,26 +27,36 @@
// launches. The programmer must insert the proper synchronization if memory
// segments are used from multiple streams.
//
+// The library provides a recordStream() function to help insert the correct
+// synchronization when allocations are used on multiple streams. This will
+// ensure that the block is not reused before each recorded stream completes
+// work.
+//
namespace {
+typedef std::shared_ptr<THCStream> THCStreamPtr;
+typedef std::set<THCStreamPtr> stream_set;
+
const size_t kRoundSmall = 512; // round up small allocs to 512 bytes
const size_t kRoundLarge = 131072; // round up large allocs to 128 KiB
const size_t kSmallAlloc = 1048576; // largest "small" allocation is 1 MiB
struct Block {
- int device; // gpu
- cudaStream_t stream; // allocation stream
- size_t size; // block size in bytes
- char* ptr; // memory address
- bool allocated; // in-use flag
- Block* prev; // prev block if split from a larger allocation
- Block* next; // next block if split from a larger allocation
+ int device; // gpu
+ cudaStream_t stream; // allocation stream
+ stream_set stream_uses; // streams on which the block was used
+ size_t size; // block size in bytes
+ char* ptr; // memory address
+ bool allocated; // in-use flag
+ Block* prev; // prev block if split from a larger allocation
+ Block* next; // next block if split from a larger allocation
+ int event_count; // number of outstanding CUDA events
Block(int device, cudaStream_t stream, size_t size, char* ptr=NULL) :
- device(device), stream(stream), size(size), ptr(ptr), allocated(0),
- prev(NULL), next(NULL) { }
+ device(device), stream(stream), stream_uses(), size(size), ptr(ptr),
+ allocated(0), prev(NULL), next(NULL), event_count(0) { }
};
static bool BlockComparator(const Block* a, const Block* b)
@@ -69,9 +80,12 @@ struct THCCachingAllocator
typedef bool (*Comparison)(const Block*, const Block*);
typedef std::set<Block*, Comparison> FreeBlocks;
- // lock around malloc and free
+ // lock around all operations
std::mutex mutex;
+ // lock around calls to cudaFree (to prevent deadlocks with NCCL)
+ std::mutex cuda_free_mutex;
+
// cached blocks larger than 1 MB
FreeBlocks large_blocks;
@@ -81,6 +95,9 @@ struct THCCachingAllocator
// allocated blocks by device pointer
std::unordered_map<void*, Block*> allocated_blocks;
+ // outstanding cuda events
+ std::deque<std::pair<cudaEvent_t, Block*>> cuda_events;
+
THCCachingAllocator() :
large_blocks(BlockComparator),
small_blocks(BlockComparator) {}
@@ -96,6 +113,11 @@ struct THCCachingAllocator
return err;
}
+ err = process_events();
+ if (err != cudaSuccess) {
+ return err;
+ }
+
size = round_size(size);
bool small = size <= kSmallAlloc;
@@ -156,15 +178,13 @@ struct THCCachingAllocator
Block* block = it->second;
allocated_blocks.erase(it);
-
- bool small = block->size <= kSmallAlloc;
- auto& free_blocks = small ? large_blocks : small_blocks;
- try_merge_blocks(block, block->prev, free_blocks);
- try_merge_blocks(block, block->next, free_blocks);
-
block->allocated = false;
- free_blocks.insert(block);
+ if (!block->stream_uses.empty()) {
+ return insert_events(block);
+ }
+
+ free_block(block);
return cudaSuccess;
}
@@ -226,10 +246,37 @@ struct THCCachingAllocator
cacheInfoAux(small_blocks, dev_id, total, largest);
}
+ void recordStream(void* ptr, THCStream* stream)
+ {
+ std::lock_guard<std::mutex> lock(mutex);
+ Block* block = find_allocated_block(ptr);
+ if (!block) {
+ THError("invalid device pointer: %p", ptr);
+ }
+ if (stream->stream == block->stream) {
+ // ignore uses on the allocation stream, since those don't require any
+ // special synchronization
+ return;
+ }
+ THCStream_retain(stream);
+ block->stream_uses.insert(THCStreamPtr(stream, &THCStream_free));
+ }
+
+ /** moves a block into the free block list */
+ void free_block(Block* block)
+ {
+ THAssert(!block->allocated && block->event_count == 0);
+ bool small = block->size <= kSmallAlloc;
+ auto& free_blocks = small ? large_blocks : small_blocks;
+ try_merge_blocks(block, block->prev, free_blocks);
+ try_merge_blocks(block, block->next, free_blocks);
+ free_blocks.insert(block);
+ }
+
/** combine previously split blocks */
void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks)
{
- if (!src || src->allocated) {
+ if (!src || src->allocated || src->event_count > 0) {
return;
}
if (dst->prev == src) {
@@ -303,6 +350,7 @@ struct THCCachingAllocator
cudaError_t free_blocks(FreeBlocks& blocks, FreeBlocks::iterator it, FreeBlocks::iterator end)
{
// Frees all non-split blocks between `it` and `end`
+ std::lock_guard<std::mutex> lock(cuda_free_mutex);
while (it != end) {
Block* block = *it;
if (!block->prev && !block->next) {
@@ -328,6 +376,69 @@ struct THCCachingAllocator
}
return it->second;
}
+
+ cudaError_t insert_events(Block* block)
+ {
+ cudaError_t err;
+
+ int prev_device;
+ err = cudaGetDevice(&prev_device);
+ if (err != cudaSuccess) return err;
+
+ std::set<THCStreamPtr> streams(std::move(block->stream_uses));
+ THAssert(block->stream_uses.empty());
+ for (auto it = streams.begin(); it != streams.end(); ++it) {
+ auto& stream = *it;
+
+ err = cudaSetDevice(stream->device);
+ if (err != cudaSuccess) break;
+
+ cudaEvent_t event;
+ err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+ if (err != cudaSuccess) break;
+
+ err = cudaEventRecord(event, stream->stream);
+ if (err != cudaSuccess) break;
+
+ block->event_count++;
+ cuda_events.emplace_back(event, block);
+ }
+
+ cudaSetDevice(prev_device);
+ return err;
+ }
+
+ cudaError_t process_events()
+ {
+ // Process outstanding cudaEvents. Events that are completed are removed
+ // from the queue, and the 'event_count' for the corresponding allocation
+ // is decremented. Stops at the first event which has not been completed.
+ // Since events on different devices or streams may occur out of order,
+ // the processing of some events may be delayed.
+ while (!cuda_events.empty()) {
+ auto& e = cuda_events.front();
+ cudaEvent_t event = e.first;
+ Block* block = e.second;
+
+ cudaError_t err = cudaEventQuery(event);
+ if (err == cudaErrorNotReady) {
+ break;
+ } else if (err != cudaSuccess) {
+ return err;
+ }
+ err = cudaEventDestroy(event);
+ if (err != cudaSuccess) {
+ return err;
+ }
+
+ block->event_count--;
+ if (block->event_count == 0) {
+ free_block(block);
+ }
+ cuda_events.pop_front();
+ }
+ return cudaSuccess;
+ }
};
static cudaError_t THCCachingAllocator_malloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
@@ -374,3 +485,13 @@ THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size)
{
return caching_allocator.getBaseAllocation(ptr, size);
}
+
+THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream)
+{
+ caching_allocator.recordStream(ptr, stream);
+}
+
+THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex()
+{
+ return &caching_allocator.cuda_free_mutex;
+}
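recordStream() is the hook the expanded comment block describes: when a block allocated on one stream is consumed on another, the allocator attaches an event to the consumer stream at free time and only recycles the block after that event completes (process_events() drains the queue lazily on the next malloc). A hedged sketch of the intended call pattern; THCStream_new with a non-blocking flag is assumed from the pre-existing THCStream API, and the kernel launches are elided:

    #include "THC.h"
    #include "THCStream.h"
    #include "THCCachingAllocator.h"

    static void crossStreamUse(THCState *state, size_t nbytes)
    {
      void *buf;
      THCudaCheck(THCudaMalloc(state, &buf, nbytes));          /* served by the caching allocator
                                                                   when it is enabled */
      THCStream *side = THCStream_new(cudaStreamNonBlocking);  /* assumed existing constructor */

      /* ... launch kernels on side->stream that read or write buf ... */

      THCCachingAllocator_recordStream(buf, side);   /* register the extra consumer stream */
      THCudaCheck(THCudaFree(state, buf));           /* block held until side's event has fired */
      THCStream_free(side);
    }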
diff --git a/lib/THC/THCCachingAllocator.h b/lib/THC/THCCachingAllocator.h
index 3eb3725..fbf9109 100644
--- a/lib/THC/THCCachingAllocator.h
+++ b/lib/THC/THCCachingAllocator.h
@@ -1,9 +1,19 @@
#ifndef THC_DEVICE_ALLOCATOR_INC
#define THC_DEVICE_ALLOCATOR_INC
+#if __cplusplus >= 201103L
+#include <mutex>
+#endif
+
#include "THCGeneral.h"
+#include "THCStream.h"
THC_API THCDeviceAllocator* THCCachingAllocator_get(void);
THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size);
+THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream);
+
+#if __cplusplus >= 201103L
+THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex();
+#endif
#endif
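The header also exports the new cudaFree mutex, added (per the comment in the .cpp) so that code issuing NCCL collectives can serialize against cache flushes and avoid the deadlock where one thread sits inside cudaFree while another blocks in a collective. A sketch of the intended guard, with the collective itself left as a placeholder:

    #include <mutex>
    #include "THCCachingAllocator.h"

    void launchCollective(void)
    {
      /* hold the allocator's cudaFree lock for the duration of the collective */
      std::lock_guard<std::mutex> guard(*THCCachingAllocator_getCudaFreeMutex());
      /* ncclAllReduce(...);  placeholder for any call that must not overlap a cudaFree */
    }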
diff --git a/lib/THC/THCCachingHostAllocator.cpp b/lib/THC/THCCachingHostAllocator.cpp
index 3cbbccb..a43cb30 100644
--- a/lib/THC/THCCachingHostAllocator.cpp
+++ b/lib/THC/THCCachingHostAllocator.cpp
@@ -2,6 +2,7 @@
#include <cuda_runtime_api.h>
#include <deque>
+#include <memory>
#include <mutex>
#include <set>
#include <stdint.h>
@@ -11,6 +12,8 @@
namespace {
+typedef std::shared_ptr<THCStream> THCStreamPtr;
+
struct BlockSize
{
size_t size; // allocation size
@@ -23,9 +26,10 @@ struct Block : public BlockSize
{
bool allocated; // true if the block is currently allocated
int event_count; // number of outstanding cuda events
+ std::set<THCStreamPtr> streams;
Block(size_t size, void* ptr, bool allocated) :
- BlockSize(size, ptr), allocated(allocated), event_count(0) { }
+ BlockSize(size, ptr), allocated(allocated), event_count(0), streams() {}
};
static bool BlockComparator(const BlockSize& a, const BlockSize& b)
@@ -98,13 +102,28 @@ struct HostAllocator
return cudaSuccess;
}
+ // process outstanding cuda events which may have occurred
+ cudaError_t err = processEvents();
+ if (err != cudaSuccess) {
+ return err;
+ }
+
auto it = blocks.find(ptr);
THAssert(it != blocks.end());
Block& block = it->second;
THAssert(block.allocated);
+ // free (on valid memory) shouldn't fail, so mark unallocated before
+ // we process the streams.
block.allocated = false;
+
+ // insert CUDA events for each stream on which this block was used. This
+ err = insertEvents(block);
+ if (err != cudaSuccess) {
+ return err;
+ }
+
if (block.event_count == 0) {
// the block can be re-used if there are no outstanding cuda events
available.insert(block);
@@ -112,7 +131,7 @@ struct HostAllocator
return cudaSuccess;
}
- cudaError_t recordEvent(void* ptr, cudaStream_t stream)
+ cudaError_t recordEvent(void* ptr, THCStream *stream)
{
std::lock_guard<std::mutex> lock(mutex);
cudaError_t err;
@@ -126,26 +145,10 @@ struct HostAllocator
Block& block = it->second;
THAssert(block.allocated);
- // process outstanding cuda events which may have occurred
- err = processEvents();
- if (err != cudaSuccess) {
- return err;
- }
+ THCStreamPtr stream_ptr(stream, &THCStream_free);
+ THCStream_retain(stream);
- // create and record an event in the given stream
- cudaEvent_t event;
- err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
- if (err != cudaSuccess) {
- return err;
- }
- err = cudaEventRecord(event, stream);
- if (err != cudaSuccess) {
- return err;
- }
-
- // the block will not be re-used until all associated events have occured
- block.event_count++;
- cuda_events.emplace_back(event, ptr);
+ block.streams.insert(std::move(stream_ptr));
return cudaSuccess;
}
@@ -186,18 +189,17 @@ struct HostAllocator
std::lock_guard<std::mutex> lock(mutex);
// remove events for freed blocks
- std::deque<std::pair<cudaEvent_t, void*>> new_events;
for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) {
cudaEvent_t event = it->first;
Block& block = blocks.at(it->second);
if (!block.allocated) {
THCudaCheckWarn(cudaEventDestroy(event));
block.event_count--;
- } else {
- new_events.push_back(*it);
}
}
- cuda_events.swap(new_events);
+
+ // all cuda_events have been processed
+ cuda_events.clear();
// clear list of available blocks
available.clear();
@@ -213,6 +215,36 @@ struct HostAllocator
}
}
}
+
+ cudaError_t insertEvents(Block& block)
+ {
+ cudaError_t err;
+
+ int prev_device;
+ err = cudaGetDevice(&prev_device);
+ if (err != cudaSuccess) return err;
+
+ std::set<THCStreamPtr> streams(std::move(block.streams));
+ for (auto it = streams.begin(); it != streams.end(); ++it) {
+ auto& stream = *it;
+
+ err = cudaSetDevice(stream->device);
+ if (err != cudaSuccess) break;
+
+ cudaEvent_t event;
+ err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+ if (err != cudaSuccess) break;
+
+ err = cudaEventRecord(event, stream->stream);
+ if (err != cudaSuccess) break;
+
+ block.event_count++;
+ cuda_events.emplace_back(event, block.ptr);
+ }
+
+ cudaSetDevice(prev_device);
+ return err;
+ }
};
} // namespace
@@ -232,7 +264,7 @@ static void THCCachingHostAllocator_free(void* ctx, void* ptr)
allocator.free(ptr);
}
-cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream)
+cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream)
{
return allocator.recordEvent(ptr, stream);
}
diff --git a/lib/THC/THCCachingHostAllocator.h b/lib/THC/THCCachingHostAllocator.h
index a695565..05513ac 100644
--- a/lib/THC/THCCachingHostAllocator.h
+++ b/lib/THC/THCCachingHostAllocator.h
@@ -2,6 +2,7 @@
#define THC_CACHING_HOST_ALLOCATOR_INC
#include "THCGeneral.h"
+#include "THCStream.h"
//
// A caching allocator for CUDA host allocations (pinned memory).
@@ -22,7 +23,7 @@ THC_API THAllocator THCCachingHostAllocator;
// Records an event in the specified stream. The allocation 'ptr' will not be
// re-used until the event has occured.
-THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream);
+THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream);
// Releases cached pinned memory allocations via cudaHostFree
THC_API void THCCachingHostAllocator_emptyCache(void);
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
index c442bd8..e99487e 100644
--- a/lib/THC/THCGeneral.c
+++ b/lib/THC/THCGeneral.c
@@ -75,6 +75,7 @@ void THCudaInit(THCState* state)
state->currentStreams[i] = THCThreadLocal_alloc();
}
state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();
+ state->currentPerDeviceSparseHandle = THCThreadLocal_alloc();
state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
@@ -107,9 +108,9 @@ void THCudaInit(THCState* state)
THCudaCheck(cudaSetDevice(i));
THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));
- // Allocate space for the NULL stream
+ // Allocate space for the default stream
res->streams = (THCStream**) malloc(sizeof(THCStream*));
- res->streams[0] = NULL;
+ res->streams[0] = THCStream_defaultStream(i);
/* The scratch space that we want to have available per each device is
based on the number of SMs available per device. We guarantee a
@@ -131,6 +132,7 @@ void THCudaInit(THCState* state)
// cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
// handles are created lazily.
state->numUserBlasHandles = 1;
+ state->numUserSparseHandles = 1;
state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
state->heapDelta = 0;
@@ -158,14 +160,18 @@ void THCudaShutdown(THCState* state)
for (int dev = 0; dev < deviceCount; ++dev) {
THCudaCheck(cudaSetDevice(dev));
THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]);
- /* Free user reserved streams (0 is the default stream) */
- for (int i = 1; i <= state->numUserStreams; ++i) {
+ /* Free all streams */
+ for (int i = 0; i <= state->numUserStreams; ++i) {
THCStream_free(res->streams[i]);
}
/* Free user defined BLAS handles */
for (int i = 0; i < res->numBlasHandles; ++i) {
THCublasCheck(cublasDestroy(res->blasHandles[i]));
}
+ /* Free user defined sparse handles */
+ for (int i = 0; i < res->numSparseHandles; ++i) {
+ THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
+ }
/* Free per-stream scratch space; starts at 0 because there is space for
the default stream as well*/
if (res->devScratchSpacePerStream) {
@@ -176,6 +182,7 @@ void THCudaShutdown(THCState* state)
free(res->streams);
free(res->blasHandles);
+ free(res->sparseHandles);
free(res->devScratchSpacePerStream);
THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
THCThreadLocal_free(state->currentStreams[dev]);
@@ -293,11 +300,20 @@ THAllocator* THCState_getCudaUVAAllocator(THCState* state)
return state->cudaUVAAllocator;
}
+THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state)
+{
+ return state->cudaDeviceAllocator;
+}
+
void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator)
{
state->cudaDeviceAllocator = allocator;
}
+int THCState_isCachingAllocatorEnabled(THCState* state) {
+ return state->cudaHostAllocator == &THCCachingHostAllocator;
+}
+
int THCState_getNumDevices(THCState *state)
{
return state->numDevices;
@@ -383,6 +399,29 @@ void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasH
THCudaCheck(cudaSetDevice(prevDev));
}
+void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSparseHandles)
+{
+ int prevDev = -1;
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ if (numSparseHandles <= res->numSparseHandles) {
+ return;
+ }
+
+ THCudaCheck(cudaGetDevice(&prevDev));
+ THCudaCheck(cudaSetDevice(device));
+
+ size_t size = numSparseHandles * sizeof(cusparseHandle_t);
+ cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
+ for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
+ handles[i] = NULL;
+ THCusparseCheck(cusparseCreate(&handles[i]));
+ }
+ res->sparseHandles = handles;
+ res->numSparseHandles = numSparseHandles;
+
+ THCudaCheck(cudaSetDevice(prevDev));
+}
+
void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
{
// cuBLAS handles are created lazily from THCState_getDeviceBlasHandle
@@ -393,6 +432,16 @@ void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
}
}
+void THCState_reserveSparseHandles(THCState* state, int numSparseHandles)
+{
+ // cuBLAS handles are created lazily from THCState_getDeviceSparseHandle
+ // to avoid initializing unused devices
+ if (numSparseHandles > state->numUserSparseHandles)
+ {
+ state->numUserSparseHandles = numSparseHandles;
+ }
+}
+
int THCState_getNumStreams(THCState* state)
{
return state->numUserStreams;
@@ -403,6 +452,11 @@ int THCState_getNumBlasHandles(THCState* state)
return state->numUserBlasHandles;
}
+int THCState_getNumSparseHandles(THCState* state)
+{
+ return state->numUserSparseHandles;
+}
+
THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device)
{
@@ -423,7 +477,7 @@ cudaStream_t THCState_getDeviceStream(THCState *state, int device, int streamInd
}
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
THCStream* stream = res->streams[streamIndex];
- return stream ? stream->stream : NULL;
+ return stream->stream;
}
cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle)
@@ -437,20 +491,37 @@ cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int han
return res->blasHandles[handle - 1];
}
+cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
+{
+ if (handle <= 0 || handle > state->numUserSparseHandles) {
+ THError("%d is not a valid handle, valid range is: (1, %d)",
+ handle, state->numUserSparseHandles);
+ }
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ THCState_reserveDeviceSparseHandles(state, device, handle);
+ return res->sparseHandles[handle - 1];
+}
+
static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
{
- return (THCStream*) THCThreadLocal_get(state->currentStreams[device]);
+ THCThreadLocal local = state->currentStreams[device];
+ THCStream* stream = (THCStream*)THCThreadLocal_get(local);
+ if (!stream) {
+ stream = THCStream_defaultStream(device);
+ THCStream_retain(stream);
+ THCThreadLocal_set(local, stream);
+ }
+ return stream;
}
static void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream)
{
- if (stream) {
- if (stream->device != device) {
- THError("invalid stream; expected stream for device %d, but was on %d",
- device, stream->device);
- }
- THCStream_retain(stream);
+ THAssert(stream);
+ if (stream->device != device) {
+ THError("invalid stream; expected stream for device %d, but was on %d",
+ device, stream->device);
}
+ THCStream_retain(stream);
THCThreadLocal local = state->currentStreams[device];
THCStream_free((THCStream*)THCThreadLocal_get(local));
THCThreadLocal_set(local, stream);
@@ -459,7 +530,8 @@ static void THCState_setStreamOnDevice(THCState *state, int device, THCStream *s
cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device)
{
THCStream* stream = THCState_getStreamOnDevice(state, device);
- return stream ? stream->stream : NULL;
+ THAssert(stream);
+ return stream->stream;
}
cudaStream_t THCState_getCurrentStream(THCState *state)
@@ -493,12 +565,25 @@ cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
return NULL;
}
+cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
+{
+ /* This is called at the point of kernel execution.
+ For some debugging code or improperly instrumented kernels,
+ `state` is null */
+ if (state) {
+ int device;
+ THCudaCheck(cudaGetDevice(&device));
+
+ int handle = THCState_getCurrentSparseHandleIndex(state);
+ return THCState_getDeviceSparseHandle(state, device, handle);
+ }
+ THError("THCState and sparseHandles must be set as there is no default sparseHandle");
+ return NULL;
+}
+
int THCState_getCurrentStreamIndex(THCState *state)
{
THCStream* stream = THCState_getStream(state);
- if (!stream) {
- return 0;
- }
int device;
THCudaCheck(cudaGetDevice(&device));
@@ -521,6 +606,15 @@ int THCState_getCurrentBlasHandleIndex(THCState *state)
return (int) (intptr_t) value;
}
+int THCState_getCurrentSparseHandleIndex(THCState *state)
+{
+ void* value = THCThreadLocal_get(state->currentPerDeviceSparseHandle);
+ if (value == NULL) {
+ return 1;
+ }
+ return (int) (intptr_t) value;
+}
+
THCStream* THCState_getStream(THCState *state)
{
int device;
@@ -544,13 +638,8 @@ void THCState_setCurrentStreamIndex(THCState *state, int streamIndex)
int device;
for (device = 0; device < state->numDevices; ++device) {
- THCStream* stream = NULL;
- if (streamIndex != 0) {
- THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
- stream = res->streams[streamIndex];
- }
-
- THCState_setStreamOnDevice(state, device, stream);
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ THCState_setStreamOnDevice(state, device, res->streams[streamIndex]);
}
}
@@ -564,6 +653,16 @@ void THCState_setCurrentBlasHandleIndex(THCState *state, int handle)
THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle);
}
+void THCState_setCurrentSparseHandleIndex(THCState *state, int handle)
+{
+ if (handle > state->numUserSparseHandles || handle <= 0)
+ {
+ THError("%d is not a valid handle, valid range is: (1, %d)",
+ handle, state->numUserSparseHandles);
+ }
+ THCThreadLocal_set(state->currentPerDeviceSparseHandle, (void*)(intptr_t)handle);
+}
+
void* THCState_getCurrentDeviceScratchSpace(THCState* state)
{
int device = -1;
@@ -668,6 +767,55 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
}
}
+void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
+{
+ if(status != CUSPARSE_STATUS_SUCCESS)
+ {
+ const char* errmsg = NULL;
+
+ switch(status)
+ {
+ case CUSPARSE_STATUS_NOT_INITIALIZED:
+ errmsg = "library not initialized";
+ break;
+
+ case CUSPARSE_STATUS_ALLOC_FAILED:
+ errmsg = "resource allocation failed";
+ break;
+
+ case CUSPARSE_STATUS_INVALID_VALUE:
+ errmsg = "an invalid numeric value was used as an argument";
+ break;
+
+ case CUSPARSE_STATUS_ARCH_MISMATCH:
+ errmsg = "an absent device architectural feature is required";
+ break;
+
+ case CUSPARSE_STATUS_MAPPING_ERROR:
+ errmsg = "an access to GPU memory space failed";
+ break;
+
+ case CUSPARSE_STATUS_EXECUTION_FAILED:
+ errmsg = "the GPU program failed to execute";
+ break;
+
+ case CUSPARSE_STATUS_INTERNAL_ERROR:
+ errmsg = "an internal operation failed";
+ break;
+
+ case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+ errmsg = "the matrix type is not supported by this function";
+ break;
+
+ default:
+ errmsg = "unknown error";
+ break;
+ }
+
+ _THError(file, line, "cusparse runtime error : %s", errmsg);
+ }
+}
+
static ptrdiff_t heapSize = 0; // not thread-local
static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
@@ -700,6 +848,27 @@ cudaError_t THCudaFree(THCState *state, void *ptr)
return allocator->free(allocator->state, ptr);
}
+void* THCudaHostAlloc(THCState *state, size_t size)
+{
+ THCudaCheck(cudaGetLastError());
+ THAllocator* allocator = state->cudaHostAllocator;
+ return allocator->malloc(NULL, size);
+}
+
+void THCudaHostFree(THCState *state, void *ptr)
+{
+ THAllocator* allocator = state->cudaHostAllocator;
+ return allocator->free(NULL, ptr);
+}
+
+void THCudaHostRecord(THCState *state, void *ptr)
+{
+ if (state->cudaHostAllocator == &THCCachingHostAllocator) {
+ THCStream* stream = THCState_getStream(state);
+ THCCachingHostAllocator_recordEvent(ptr, stream);
+ }
+}
+
cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes)
{
size_t cachedBytes = 0;
@@ -768,3 +937,19 @@ void THCHeapUpdate(THCState *state, ptrdiff_t size) {
#include "THCStorage.c"
#include "THCAllocator.c"
+
+/* from THCHalf.h */
+
+half THC_float2half(float f)
+{
+ half h;
+ TH_float2halfbits(&f, &h.x);
+ return h;
+}
+
+float THC_half2float(half h)
+{
+ float f;
+ TH_halfbits2float(&h.x, &f);
+ return f;
+}
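The cuSPARSE plumbing mirrors the existing cuBLAS handle management: handles are per-device, created lazily, selected through a thread-local index, and THCusparseCheck maps cusparseStatus_t codes onto readable THErrors. The THC_float2half/THC_half2float definitions appended at the end replace the hand-rolled conversions removed from THCHalf.cu below, delegating to TH's shared bit-level helpers. A minimal sketch of fetching the current sparse handle and binding it to the current stream before a cuSPARSE call:

    /* sketch: use the lazily created per-device cuSPARSE handle on the current stream */
    #include "THC.h"

    static void useSparseHandle(THCState *state)
    {
      cusparseHandle_t handle = THCState_getCurrentSparseHandle(state);
      THCusparseCheck(cusparseSetStream(handle, THCState_getCurrentStream(state)));
      /* ... a cusparseScsrmv(handle, ...) or similar call would follow here ... */
    }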
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
index a88bd7d..f33446d 100644
--- a/lib/THC/THCGeneral.h.in
+++ b/lib/THC/THCGeneral.h.in
@@ -9,6 +9,7 @@
#include "cuda.h"
#include "cuda_runtime.h"
#include "cublas_v2.h"
+#include "cusparse.h"
#cmakedefine USE_MAGMA
@@ -57,8 +58,12 @@ typedef struct _THCCudaResourcesPerDevice {
THCStream** streams;
/* Number of materialized cuBLAS handles */
int numBlasHandles;
+ /* Number of materialized cuSparse handles */
+ int numSparseHandles;
/* cuBLAS handes are lazily initialized */
cublasHandle_t* blasHandles;
+ /* cuSparse handes are lazily initialized */
+ cusparseHandle_t* sparseHandles;
/* Size of scratch space per each stream on this device available */
size_t scratchSpacePerStream;
/* Device-resident scratch space per stream, used for global memory
@@ -72,9 +77,9 @@ struct THCState {
struct THCRNGState* rngState;
struct cudaDeviceProp* deviceProperties;
/* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL,
- which specifies the per-device default stream. blasHandles do not have a
- default and must be explicitly initialized. We always initialize 1
- blasHandle but we can use more.
+ which specifies the per-device default stream. blasHandles and
+ sparseHandles do not have a default and must be explicitly initialized.
+ We always initialize 1 blasHandle and 1 sparseHandle but we can use more.
*/
THCCudaResourcesPerDevice* resourcesPerDevice;
/* Captured number of devices upon startup; convenience for bounds checking */
@@ -82,6 +87,7 @@ struct THCState {
/* Number of Torch defined resources available, indices 1 ... numStreams */
int numUserStreams;
int numUserBlasHandles;
+ int numUserSparseHandles;
/* Allocator using cudaMallocHost. */
THAllocator* cudaHostAllocator;
@@ -91,6 +97,9 @@ struct THCState {
/* Index of the current selected BLAS handle. The actual BLAS handle used
depends on the current device. */
THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
+ /* Index of the current selected sparse handle. The actual sparse handle used
+ depends on the current device. */
+ THCThreadLocal/*<int>*/ currentPerDeviceSparseHandle;
/* Array of thread locals containing the current stream for each device */
THCThreadLocal* currentStreams;
@@ -139,7 +148,9 @@ THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* sta
THC_API struct THCRNGState* THCState_getRngState(THCState* state);
THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state);
THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state);
+THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state);
THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator);
+THC_API int THCState_isCachingAllocatorEnabled(THCState* state);
THC_API void THCMagma_init(THCState *state);
@@ -161,27 +172,42 @@ THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream);
THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles);
THC_API int THCState_getNumBlasHandles(THCState* state);
+THC_API void THCState_reserveSparseHandles(THCState* state, int numHandles);
+THC_API int THCState_getNumSparseHandles(THCState* state);
+
THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle);
THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);
+THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle);
+THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state);
+THC_API int THCState_getCurrentSparseHandleIndex(THCState *state);
+THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle);
+
/* For the current device and stream, returns the allocated scratch space */
THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state);
THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream);
THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);
+#define THCAssertSameGPU(expr) if (!expr) THError("arguments are located on different GPUs")
#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)
#define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__)
#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__)
+#define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__)
THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
+THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
+THC_API void* THCudaHostAlloc(THCState *state, size_t size);
+THC_API void THCudaHostFree(THCState *state, void *ptr);
+THC_API void THCudaHostRecord(THCState *state, void *ptr);
+
THC_API cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes);
THC_API void THCSetGCHandler(THCState *state,
void (*torchGCHandlerFunction)(void *data),
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 023774e..7863260 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -33,96 +33,6 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
in, in + len, out, __half2floatOp());
}
-// FixMe: could call TH_half2float
-// and convert types here, but maybe slower?
-float THC_half2float(half h)
-{
- unsigned sign = ((h.x >> 15) & 1);
- unsigned exponent = ((h.x >> 10) & 0x1f);
- unsigned mantissa = ((h.x & 0x3ff) << 13);
-
- if (exponent == 0x1f) { /* NaN or Inf */
- mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
- exponent = 0xff;
- } else if (!exponent) { /* Denorm or Zero */
- if (mantissa) {
- unsigned int msb;
- exponent = 0x71;
- do {
- msb = (mantissa & 0x400000);
- mantissa <<= 1; /* normalize */
- --exponent;
- } while (!msb);
- mantissa &= 0x7fffff; /* 1.mantissa is implicit */
- }
- } else {
- exponent += 0x70;
- }
-
- int temp = ((sign << 31) | (exponent << 23) | mantissa);
-
- float x;
- memcpy(&x,&temp,sizeof(float));
- return x;
-}
-
-half THC_float2half(float f)
-{
- half ret;
-
- unsigned x;
- memcpy(&x,&f,sizeof(f));
- unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
- unsigned sign, exponent, mantissa;
-
- // Get rid of +NaN/-NaN case first.
- if (u > 0x7f800000) {
- ret.x = 0x7fffU;
- return ret;
- }
-
- sign = ((x >> 16) & 0x8000);
-
- // Get rid of +Inf/-Inf, +0/-0.
- if (u > 0x477fefff) {
- ret.x = sign | 0x7c00U;
- return ret;
- }
- if (u < 0x33000001) {
- ret.x = (sign | 0x0000);
- return ret;
- }
-
- exponent = ((u >> 23) & 0xff);
- mantissa = (u & 0x7fffff);
-
- if (exponent > 0x70) {
- shift = 13;
- exponent -= 0x70;
- } else {
- shift = 0x7e - exponent;
- exponent = 0;
- mantissa |= 0x800000;
- }
- lsb = (1 << shift);
- lsb_s1 = (lsb >> 1);
- lsb_m1 = (lsb - 1);
-
- // Round to nearest even.
- remainder = (mantissa & lsb_m1);
- mantissa >>= shift;
- if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
- ++mantissa;
- if (!(mantissa & 0x3ff)) {
- ++exponent;
- mantissa = 0;
- }
- }
-
- ret.x = (sign | (exponent << 10) | mantissa);
- return ret;
-}
-
THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
cudaDeviceProp* prop =
THCState_getCurrentDeviceProperties(state);
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index 0944360..b6d1dac 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -48,7 +48,7 @@ struct THCNumerics<char> {
static inline __host__ __device__ char mul(char a, char b) { return a * b; }
static inline __host__ __device__ char sub(char a, char b) { return a - b; }
static inline __host__ __device__ char div(char a, char b) { return a / b; }
- static inline __host__ __device__ char abs(char a) { return abs(a); }
+ static inline __host__ __device__ char abs(char a) { return ::abs((int)a); }
};
template <>
@@ -67,7 +67,7 @@ struct THCNumerics<short> {
static inline __host__ __device__ short mul(short a, short b) { return a * b; }
static inline __host__ __device__ short sub(short a, short b) { return a - b; }
static inline __host__ __device__ short div(short a, short b) { return a / b; }
- static inline __host__ __device__ short abs(short a) { return abs(a); }
+ static inline __host__ __device__ short abs(short a) { return ::abs((int)a); }
};
template <>
@@ -211,6 +211,19 @@ struct THCNumerics<half> {
#endif
}
+ static inline __host__ __device__ half exp10(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return hexp10(a);
+#else
+ float fa = __half2float(a);
+ return __float2half(exp10f(fa));
+#endif
+#else // __CUDA_ARCH__
+ return THC_float2half(exp10f(THC_half2float(a)));
+#endif
+ }
+
static inline __host__ __device__ half log(half a) {
#ifdef __CUDA_ARCH__
#ifdef CUDA_HALF_INSTRUCTIONS
@@ -233,6 +246,15 @@ struct THCNumerics<half> {
#endif
}
+static inline __host__ __device__ half lgamma(half a) {
+#ifdef __CUDA_ARCH__
+ float fa = __half2float(a);
+ return __float2half(lgammaf(fa));
+#else // __CUDA_ARCH__
+ return THC_float2half(lgammaf(THC_half2float(a)));
+#endif
+ }
+
static inline __host__ __device__ half cos(half a) {
#ifdef __CUDA_ARCH__
#ifdef CUDA_HALF_INSTRUCTIONS
@@ -514,7 +536,9 @@ struct THCNumerics<float> {
static inline __host__ __device__ bool eq(float a, float b) { return a == b; }
static inline __host__ __device__ bool ne(float a, float b) { return a != b; }
+ static inline __host__ __device__ float lgamma(float a) { return lgammaf(a);}
static inline __host__ __device__ float exp (float a) { return expf(a); }
+ static inline __host__ __device__ float exp10(float a) { return exp10f(a); }
static inline __host__ __device__ float log (float a) { return logf(a); }
static inline __host__ __device__ float log1p(float a) { return log1pf(a); }
static inline __host__ __device__ float cos (float a) { return cosf(a); }
@@ -557,7 +581,9 @@ struct THCNumerics<double> {
static inline __host__ __device__ bool eq(double a, double b) { return a == b; }
static inline __host__ __device__ bool ne(double a, double b) { return a != b; }
+ static inline __host__ __device__ double lgamma(double a) { return ::lgamma(a);}
static inline __host__ __device__ double exp (double a) { return ::exp(a); }
+ static inline __host__ __device__ double exp10(double a) { return ::exp10(a); }
static inline __host__ __device__ double log (double a) { return ::log(a); }
static inline __host__ __device__ double log1p(double a) { return ::log1p(a); }
static inline __host__ __device__ double cos (double a) { return ::cos(a); }
diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh
index 7f276a2..067d796 100644
--- a/lib/THC/THCReduce.cuh
+++ b/lib/THC/THCReduce.cuh
@@ -168,7 +168,8 @@ bool THC_reduceDim(THCState* state,
const ModifyOp& modifyOp,
const ReduceOp& reduceOp,
typename TensorUtils<TensorType>::DataType init,
- int dim) {
+ int dim,
+ int keepdim) {
ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
@@ -315,6 +316,10 @@ bool THC_reduceDim(THCState* state,
#undef HANDLE_IN_CASE
#undef HANDLE_OUT_CASE
+
+ if (!keepdim) {
+ TensorUtils<TensorType>::squeeze1d(state, out, out, dim);
+ }
return true;
}
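
For reference, a minimal host-side C++ sketch (illustrative only, not part of the patch) of what the new keepdim flag means for the result shape: the reduced dimension is either kept with size 1 or squeezed away, matching the squeeze1d call above. The helper name reducedShape is hypothetical.

    #include <cstdio>
    #include <vector>

    // Shape of the output of a reduction over `dim`, with and without keepdim.
    std::vector<long> reducedShape(std::vector<long> shape, int dim, bool keepdim) {
      if (keepdim) shape[dim] = 1;                    // keep the dimension as size 1
      else shape.erase(shape.begin() + dim);          // squeeze it out
      return shape;
    }

    int main() {
      for (long s : reducedShape({4, 3, 2}, 1, true))  std::printf("%ld ", s);  // 4 1 2
      std::printf("\n");
      for (long s : reducedShape({4, 3, 2}, 1, false)) std::printf("%ld ", s);  // 4 2
      std::printf("\n");
      return 0;
    }
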
diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh
index 9a335c7..1d04e63 100644
--- a/lib/THC/THCReduceAll.cuh
+++ b/lib/THC/THCReduceAll.cuh
@@ -331,7 +331,7 @@ bool THC_reduceAll(THCState* state,
// If our destination is not on the device, copy the value back to
// the host (synchronous!)
if (!outOnDevice) {
- cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost);
+ THCudaCheck(cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost));
}
if (freeDevOut) {
diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh
index e365b3a..30325de 100644
--- a/lib/THC/THCReduceApplyUtils.cuh
+++ b/lib/THC/THCReduceApplyUtils.cuh
@@ -19,57 +19,113 @@ __device__ __forceinline__ IndexType getLinearBlockId() {
blockIdx.x;
}
-// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
-// return the reduced value
-template <typename T, typename ReduceOp>
-__device__ T reduceBlock(T* smem,
- int numVals,
- T threadVal,
- ReduceOp reduceOp,
- T init) {
+// Reduce N values concurrently, i.e. suppose N = 2 and there are 4 threads holding
+// (1, 2), (3, 4), (5, 6), (7, 8); then the result in threadVals for thread 0
+// is (1 + 3 + 5 + 7, 2 + 4 + 6 + 8) = (16, 20)
+template <typename T, typename ReduceOp, int N>
+__device__ void reduceNValuesInBlock(T *smem,
+ T threadVals[N],
+ int numVals,
+ ReduceOp reduceOp,
+ T init) {
if (numVals == 0) {
- return init;
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ threadVals[i] = init;
+ }
+ return;
}
+ // We store each of the N values contiguously, so if N = 2, all values for
+ // the first threadVal for each thread in the block are stored followed by
+ // all of the values for the second threadVal for each thread in the block
if (threadIdx.x < numVals) {
- smem[threadIdx.x] = threadVal;
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ smem[i * numVals + threadIdx.x] = threadVals[i];
+ }
}
-
- // First warp will perform reductions across warps
__syncthreads();
- if ((threadIdx.x / warpSize) == 0) {
- T r = threadIdx.x < numVals ? smem[threadIdx.x] : init;
+
+ // Number of lanes in the final reduction --> this is used to determine
+ // where to put the outputs of each of the n things we are reducing. If
+ // nLP = 32, then we have the 32 outputs for the first threadVal,
+ // followed by the 32 outputs for the second threadVal, etc.
+ int numLanesParticipating = min(numVals, warpSize);
+
+ if (numVals > warpSize && ((threadIdx.x / warpSize) == 0 )) {
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ threadVals[i] = threadIdx.x < numVals ? threadVals[i] : init;
+ }
for (int i = warpSize + threadIdx.x; i < numVals; i += warpSize) {
- r = reduceOp(r, smem[i]);
+#pragma unroll
+ for (int j = 0; j < N; ++j) {
+ threadVals[j] = reduceOp(threadVals[j], smem[j * numVals + i]);
+ }
}
- smem[threadIdx.x] = r;
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ smem[i * numLanesParticipating + threadIdx.x] = threadVals[i];
+ }
}
-
- // First thread will perform reductions across the block
__syncthreads();
- T r = init;
if (threadIdx.x == 0) {
- r = smem[0];
-
- int numLanesParticipating = min(numVals, warpSize);
-
if (numLanesParticipating == 32) {
- // Unroll for warpSize == 32 and numVals >= 32
#pragma unroll
- for (int i = 1; i < 32; ++i) {
- r = reduceOp(r, smem[i]);
+ for (int i = 0; i < N; ++i) {
+#pragma unroll
+ for (int j = 1; j < 32; ++j) {
+ threadVals[i] = reduceOp(threadVals[i], smem[i * 32 + j]);
+ }
}
} else {
- for (int i = 1; i < numLanesParticipating; ++i) {
- r = reduceOp(r, smem[i]);
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ for (int j = 1; j < numLanesParticipating; ++j) {
+ threadVals[i] = reduceOp(threadVals[i], smem[i * numVals + j]);
+ }
}
}
}
+}
+
+// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
+// return the reduced value
+template <typename T, typename ReduceOp>
+__device__ T reduceBlock(T* smem,
+ int numVals,
+ T threadVal,
+ ReduceOp reduceOp,
+ T init) {
+ reduceNValuesInBlock<T, ReduceOp, 1>(smem, &threadVal, numVals, reduceOp, init);
+ return threadVal;
+}
+
+
+// Block-wide reduction where each thread locally reduces N
+// values before letting a single warp take over - assumes
+// threadVals is in registers, not shared memory
+template <typename T, typename ReduceOp, int N>
+__device__ T reduceBlockWithNThreadLocalReductions(T *smem,
+ T threadVals[N],
+ int numVals,
+ ReduceOp reduceOp,
+ T init) {
+ int offset = threadIdx.x * N;
+ T local = offset < numVals ? threadVals[0] : init;
+
+#pragma unroll
+ for (int i = 1; i < N; ++i) {
+ ++offset;
+ T next = offset < numVals ? threadVals[i] : init;
+ local = reduceOp(local, next);
+ }
- return r;
+ return reduceBlock<T, ReduceOp>(smem, blockDim.x < numVals ? blockDim.x : numVals, local, reduceOp, init);
}
// Make sure the given tensor doesn't have too many dimensions
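
As a plain C++ reference for the N-way block reduction above (a sketch under the same per-thread-values convention, not part of the patch): entry i of the result is the reduction, across all threads, of the i-th value each thread holds. The helper name reduceNValuesReference is illustrative.

    #include <cstdio>
    #include <vector>

    template <typename T, typename ReduceOp>
    std::vector<T> reduceNValuesReference(const std::vector<std::vector<T>>& vals,
                                          ReduceOp reduceOp, T init) {
      if (vals.empty()) return {};
      std::vector<T> out(vals[0].size(), init);
      for (const auto& threadVals : vals)          // one entry per "thread"
        for (size_t i = 0; i < out.size(); ++i)    // N values per thread
          out[i] = reduceOp(out[i], threadVals[i]);
      return out;
    }

    int main() {
      // The example from the comment: N = 2, four threads holding
      // (1, 2), (3, 4), (5, 6), (7, 8) -> (16, 20).
      std::vector<std::vector<int>> vals = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
      auto r = reduceNValuesReference(vals, [](int a, int b) { return a + b; }, 0);
      std::printf("(%d, %d)\n", r[0], r[1]);       // prints (16, 20)
      return 0;
    }
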
diff --git a/lib/THC/THCScanUtils.cuh b/lib/THC/THCScanUtils.cuh
index 41a4423..ccf27b7 100644
--- a/lib/THC/THCScanUtils.cuh
+++ b/lib/THC/THCScanUtils.cuh
@@ -5,9 +5,103 @@
// Collection of in-kernel scan / prefix sum utilities
+// Inclusive Scan via an upsweep/downsweep mechanism. Assumes:
+//
+// 1. Power2ScanSize is a power of 2. This code still works for collections that
+// do not contain exactly a power-of-2 number of elements: simply round up to the
+// nearest power of 2 and then call.
+//
+// 2. There are two elements per thread, i.e. the size of the smem storage
+// is 2 * blockDim.x * sizeof(T).
+//
+// Consider a (+)-Scan on the following elements:
+//
+// Upsweep:
+//
+//    0   1   2   3   4   5   6   7
+//        1       5       9      13
+//                6              22
+//                               28
+//
+// Downsweep:
+//                       15
+//            3      10      21
+template <typename T, class BinaryOp, int Power2ScanSize>
+__device__ void inclusivePrefixScan(T *smem, BinaryOp binop) {
+ // Reduce step ("upsweep")
+#pragma unroll
+ for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if (index < Power2ScanSize) {
+ smem[index] = binop(smem[index], smem[index - stride]);
+ }
+ __syncthreads();
+ }
+
+ // Post-reduce step ("downsweep")
+#pragma unroll
+ for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if ((index + stride) < Power2ScanSize) {
+ smem[index + stride] = binop(smem[index + stride], smem[index]);
+ }
+ __syncthreads();
+ }
+}
+
+// Generic Op that can be used to support segmented scans by re-using
+// the basic inclusiveScanOp. Merely requires that the input data has both
+// a flag and val component
+template <typename T, class BinaryOp>
+struct SegmentedScanOp {
+ __host__ __device__ SegmentedScanOp(BinaryOp binop): _binop(binop) {}
+ __host__ __device__ inline T operator()(const T& a, const T& b) {
+ T c;
+ c.val = a.flag ? a.val : _binop(a.val, b.val);
+ c.flag = a.flag | b.flag;
+ return c;
+ }
+
+ BinaryOp _binop;
+};
+
+// Extends the above Inclusive Scan to support segments. It has the same properties
+// but also takes a flag array that indicates the starts of "segments", i.e. individual
+// units to scan. For example, consider the following (+)-scan that is segmented:
+//
+// Input: [1, 3, 2, 4, 1, 2, 3, 2, 1, 4]
+// Flags: [1, 0, 0, 1, 0, 1, 1, 0, 1, 0]
+// Output: [1, 4, 6, 4, 5, 2, 3, 5, 1, 5]
+//
+// So we see that each "flag" resets the scan to that index.
+template <typename T, class BinaryOp, int Power2ScanSize>
+__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) {
+ // Reduce step ("upsweep")
+#pragma unroll
+ for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if (index < Power2ScanSize) {
+ smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]);
+ bmem[index] = bmem[index] | bmem[index - stride];
+ }
+ __syncthreads();
+ }
+
+ // Post-reduce step ("downsweep")
+#pragma unroll
+ for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if ((index + stride) < Power2ScanSize) {
+ smem[index + stride] = bmem[index + stride] ? smem[index + stride] : binop(smem[index + stride], smem[index]);
+ bmem[index + stride] = bmem[index + stride] | bmem[index];
+ }
+ __syncthreads();
+ }
+}
+
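
A serial C++ reference for the segmented inclusive scan documented above (a sketch, not part of the patch): a set flag restarts the running reduction, reproducing the Input/Flags/Output example. The helper name segmentedInclusiveScanRef is illustrative.

    #include <cstdio>
    #include <vector>

    template <typename T, typename BinaryOp>
    std::vector<T> segmentedInclusiveScanRef(const std::vector<T>& in,
                                             const std::vector<bool>& flags,
                                             BinaryOp binop) {
      std::vector<T> out(in.size());
      for (size_t i = 0; i < in.size(); ++i)
        out[i] = (i == 0 || flags[i]) ? in[i] : binop(out[i - 1], in[i]);
      return out;
    }

    int main() {
      std::vector<int>  in   = {1, 3, 2, 4, 1, 2, 3, 2, 1, 4};
      std::vector<bool> flag = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
      for (int v : segmentedInclusiveScanRef(in, flag, [](int a, int b) { return a + b; }))
        std::printf("%d ", v);                 // 1 4 6 4 5 2 3 5 1 5
      std::printf("\n");
      return 0;
    }
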
// Inclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency>
-__device__ void inclusivePrefixSum(T* smem, T in, T* out) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) {
// FIXME: this is a slow, simple implementation; need up/down sweep,
// prevent smem conflicts
smem[threadIdx.x] = in;
@@ -18,7 +112,7 @@ __device__ void inclusivePrefixSum(T* smem, T in, T* out) {
T val = 0;
if (threadIdx.x >= offset) {
- val = smem[threadIdx.x - offset] + smem[threadIdx.x];
+ val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]);
}
__syncthreads();
@@ -38,11 +132,11 @@ __device__ void inclusivePrefixSum(T* smem, T in, T* out) {
}
// Exclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency>
-__device__ void exclusivePrefixSum(T* smem, T in, T* out, T* carry) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) {
// FIXME: crappy implementation
// We kill write-after-read dependencies separately below, hence the `false`
- inclusivePrefixSum<T, false>(smem, in, out);
+ inclusivePrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
*out -= in;
*carry = smem[blockDim.x - 1];
@@ -55,8 +149,8 @@ __device__ void exclusivePrefixSum(T* smem, T in, T* out, T* carry) {
// Inclusive prefix sum for binary vars using intra-warp voting +
// shared memory
-template <typename T, bool KillWARDependency>
-__device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
// Within-warp, we use warp voting.
T vote = __ballot(in);
T index = __popc(getLaneMaskLe() & vote);
@@ -77,8 +171,8 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
int current = 0;
for (int i = 0; i < blockDim.x / 32; ++i) {
T v = smem[i];
- smem[i] += current;
- current += v;
+ smem[i] = binop(smem[i], current);
+ current = binop(current, v);
}
}
@@ -86,7 +180,7 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
// load the carry from the preceding warp
if (warp >= 1) {
- index += smem[warp - 1];
+ index = binop(index, smem[warp - 1]);
}
*out = index;
@@ -98,9 +192,9 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
// Exclusive prefix sum for binary vars using intra-warp voting +
// shared memory
-template <typename T, bool KillWARDependency>
-__device__ void exclusiveBinaryPrefixSum(T* smem, bool in, T* out, T* carry) {
- inclusiveBinaryPrefixSum<T, false>(smem, in, out);
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) {
+ inclusiveBinaryPrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
// Inclusive to exclusive
*out -= (T) in;
diff --git a/lib/THC/THCSortUtils.cu b/lib/THC/THCSortUtils.cu
new file mode 100644
index 0000000..3c4c0f9
--- /dev/null
+++ b/lib/THC/THCSortUtils.cu
@@ -0,0 +1,17 @@
+#include "THCSortUtils.cuh"
+
+// Returns 2^(ceil(lg(n))) from Stanford bit twiddling hacks
+unsigned long nextHighestPowerOf2(unsigned long n) {
+ n--;
+ n |= n >> 1;
+ n |= n >> 2;
+ n |= n >> 4;
+ n |= n >> 8;
+ n |= n >> 16;
+#ifndef _MSC_VER
+ n |= n >> 32;
+#endif
+ n++;
+
+ return n;
+}
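
A few expected values for the helper above (a sketch only, not part of the patch), to make the rounding behaviour concrete; exact powers of two come back unchanged because of the initial n--. The wrapper name nextHighestPowerOf2Examples is illustrative.

    #include <cassert>

    unsigned long nextHighestPowerOf2(unsigned long n);  // defined in THCSortUtils.cu

    void nextHighestPowerOf2Examples() {
      assert(nextHighestPowerOf2(1)  == 1);
      assert(nextHighestPowerOf2(5)  == 8);
      assert(nextHighestPowerOf2(64) == 64);
      assert(nextHighestPowerOf2(65) == 128);
    }
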
diff --git a/lib/THC/THCSortUtils.cuh b/lib/THC/THCSortUtils.cuh
index ec676c0..d5ad237 100644
--- a/lib/THC/THCSortUtils.cuh
+++ b/lib/THC/THCSortUtils.cuh
@@ -41,6 +41,18 @@ __device__ inline void bitonicSwap(K& kA, V& vA, bool& validA,
}
};
+template <typename Comparator, typename K>
+__device__ inline void bitonicSwapKeys(K& kA, bool& validA,
+ K& kB, bool& validB,
+ bool dir,
+ const Comparator& comp) {
+ bool swap = (comp(kA, kB) && validA) || !validB;
+ if (swap == dir) {
+ swapVars(kA, kB);
+ swapVars(validA, validB);
+ }
+}
+
template <typename Comparator, typename K, typename V,
typename IndexType, int Power2SortSize>
__device__ inline void bitonicSort(K keys[Power2SortSize],
@@ -53,12 +65,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
#pragma unroll
for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
-
- // Single warp per slice is completely synchronous
- if (Power2SortSize > 64) {
- __syncthreads();
- }
-
+
+ __syncthreads();
+
unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
bitonicSwap<Comparator, K, V>(
keys[pos], values[pos], valid[pos],
@@ -69,11 +78,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
#pragma unroll
for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
- // Single warp per slice is completely synchronous
- if (Power2SortSize > 64) {
- __syncthreads();
- }
-
+
+ __syncthreads();
+
unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
bitonicSwap<Comparator, K, V>(
keys[pos], values[pos], valid[pos],
@@ -81,10 +88,45 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
false, comp);
}
- // Single warp per slice is completely synchronous
- if (Power2SortSize > 64) {
+ __syncthreads();
+
+}
+
+template <typename Comparator, typename K,
+ typename IndexType, int Power2SortSize>
+__device__ inline void bitonicSortKeys(K keys[Power2SortSize],
+ bool valid[Power2SortSize],
+ const Comparator& comp) {
+#pragma unroll
+ for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
+ bool flag = ((threadIdx.x & (size / 2)) != 0);
+
+#pragma unroll
+ for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
+
+ __syncthreads();
+
+ unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+ bitonicSwapKeys<Comparator, K>(
+ keys[pos], valid[pos],
+ keys[pos + stride], valid[pos + stride],
+ flag, comp);
+ }
+ }
+
+#pragma unroll
+ for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
__syncthreads();
+
+ unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+ bitonicSwapKeys<Comparator, K>(
+ keys[pos], valid[pos],
+ keys[pos + stride], valid[pos + stride],
+ false, comp);
}
+
+ __syncthreads();
+
}
// Sorts (key, value) pairs (in different tensors) in-place; i.e.,
@@ -168,4 +210,6 @@ bitonicSortKVInPlace(TensorInfo<K, IndexType> keys,
}
}
+unsigned long nextHighestPowerOf2(unsigned long n);
+
#endif // THC_SORT_UTILS_INC
diff --git a/lib/THC/THCStream.c b/lib/THC/THCStream.c
deleted file mode 100644
index e261a51..0000000
--- a/lib/THC/THCStream.c
+++ /dev/null
@@ -1,30 +0,0 @@
-#include "THCStream.h"
-
-#include <cuda_runtime_api.h>
-#include "THAtomic.h"
-
-
-THCStream* THCStream_new(int flags)
-{
- THCStream* self = (THCStream*) malloc(sizeof(THCStream));
- self->refcount = 1;
- THCudaCheck(cudaGetDevice(&self->device));
- THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags));
- return self;
-}
-
-void THCStream_free(THCStream* self)
-{
- if (!self) {
- return;
- }
- if (THAtomicDecrementRef(&self->refcount)) {
- THCudaCheck(cudaStreamDestroy(self->stream));
- free(self);
- }
-}
-
-void THCStream_retain(THCStream* self)
-{
- THAtomicIncrementRef(&self->refcount);
-}
diff --git a/lib/THC/THCStream.cpp b/lib/THC/THCStream.cpp
new file mode 100644
index 0000000..49fe680
--- /dev/null
+++ b/lib/THC/THCStream.cpp
@@ -0,0 +1,60 @@
+#include "THCStream.h"
+
+#include <mutex>
+#include <cuda_runtime_api.h>
+#include "THAtomic.h"
+
+#define MAX_DEVICES 256
+static THCStream default_streams[MAX_DEVICES];
+
+static void initialize_default_streams()
+{
+ for (int i = 0; i < MAX_DEVICES; i++) {
+ default_streams[i].device = i;
+ }
+}
+
+THCStream* THCStream_new(int flags)
+{
+ THCStream* self = (THCStream*) malloc(sizeof(THCStream));
+ self->refcount = 1;
+ THCudaCheck(cudaGetDevice(&self->device));
+ THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags));
+ return self;
+}
+
+THC_API THCStream* THCStream_defaultStream(int device)
+{
+ // default streams aren't refcounted
+ THAssert(device >= 0 && device < MAX_DEVICES);
+ static std::once_flag once;
+ std::call_once(once, &initialize_default_streams);
+ return &default_streams[device];
+}
+
+THCStream* THCStream_newWithPriority(int flags, int priority)
+{
+ THCStream* self = (THCStream*) malloc(sizeof(THCStream));
+ self->refcount = 1;
+ THCudaCheck(cudaGetDevice(&self->device));
+ THCudaCheck(cudaStreamCreateWithPriority(&self->stream, flags, priority));
+ return self;
+}
+
+void THCStream_free(THCStream* self)
+{
+ if (!self || !self->stream) {
+ return;
+ }
+ if (THAtomicDecrementRef(&self->refcount)) {
+ THCudaCheckWarn(cudaStreamDestroy(self->stream));
+ free(self);
+ }
+}
+
+void THCStream_retain(THCStream* self)
+{
+ if (self->stream) {
+ THAtomicIncrementRef(&self->refcount);
+ }
+}
diff --git a/lib/THC/THCStream.h b/lib/THC/THCStream.h
index de3f64e..6ccb057 100644
--- a/lib/THC/THCStream.h
+++ b/lib/THC/THCStream.h
@@ -13,6 +13,8 @@ struct THCStream
THC_API THCStream* THCStream_new(int flags);
+THC_API THCStream* THCStream_defaultStream(int device);
+THC_API THCStream* THCStream_newWithPriority(int flags, int priority);
THC_API void THCStream_free(THCStream* self);
THC_API void THCStream_retain(THCStream* self);
diff --git a/lib/THC/THCTensorConv.cu b/lib/THC/THCTensorConv.cu
index 71aac03..c8c1ad6 100644
--- a/lib/THC/THCTensorConv.cu
+++ b/lib/THC/THCTensorConv.cu
@@ -296,7 +296,7 @@ __global__ void conv2genericrev(float *input, float *kernel, float *output,
THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input,
THCudaTensor *kernel, long srow, long scol, const char *type)
{
- THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, output, input, kernel));
long nInputPlane, nInputRows, nInputCols;
long nKernelRows, nKernelCols;
long nOutputPlane, nOutputRows, nOutputCols;
@@ -416,7 +416,7 @@ THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float
THC_API void THCudaTensor_conv2Dmm(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input,
THCudaTensor *kernel, long srow, long scol, const char *type)
{
- THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, output, input, kernel));
long nbatch, nInputPlane, nInputRows, nInputCols;
long nKernelRows, nKernelCols;
long nOutputPlane, nOutputRows, nOutputCols;
@@ -549,7 +549,7 @@ THC_API void THCudaTensor_conv2DRevger(THCState *state, THCudaTensor *output, fl
THCudaTensor *input, THCudaTensor *kernel,
long srow, long scol)
{
- THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, output, input, kernel));
long nInputPlane, nInputRows, nInputCols;
long nKernelPlane, nKernelRows, nKernelCols;
long nOutputRows, nOutputCols;
@@ -883,7 +883,7 @@ THC_API void THCudaTensor_conv2Dmap(THCState *state, THCudaTensor *output, THCud
THCudaTensor *kernel, long stride_x, long stride_y,
THCudaTensor *table, long fanin)
{
- THAssert(THCudaTensor_checkGPU(state, 4, output, input, kernel, table));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 4, output, input, kernel, table));
long nInputPlane, nInputRows, nInputCols;
long nKernelRows, nKernelCols;
long nOutputPlane, nOutputRows, nOutputCols;
diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h
index e8bc4f4..74f2b59 100644
--- a/lib/THC/THCTensorCopy.h
+++ b/lib/THC/THCTensorCopy.h
@@ -4,6 +4,7 @@
#include "THCTensor.h"
#include "THCGeneral.h"
#include "THCHalf.h"
+#include "THCStream.h"
#include "generic/THCTensorCopy.h"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu
index 41e6466..b9225fe 100644
--- a/lib/THC/THCTensorMath.cu
+++ b/lib/THC/THCTensorMath.cu
@@ -107,6 +107,32 @@ struct NonZeroOp
}
};
+template<typename T, typename accT = T>
+struct LinspaceOp {
+ __host__ __device__ LinspaceOp(accT start, accT step):
+ start_(start), step_(step) { }
+ __device__ __forceinline__ T operator()(ptrdiff_t index) {
+ accT increment = THCNumerics<accT>::mul(step_, ScalarConvert<ptrdiff_t,accT>::to(index));
+ accT value = THCNumerics<accT>::add(start_, increment);
+ return ScalarConvert<accT,T>::to(value);
+ }
+
+ const accT start_, step_;
+};
+
+template<typename T, typename accT = T>
+struct LogspaceOp {
+ __host__ __device__ LogspaceOp(accT start, accT step):
+ start_(start), step_(step) { }
+ __device__ __forceinline__ T operator()(ptrdiff_t index) {
+ accT increment = THCNumerics<accT>::mul(step_, ScalarConvert<ptrdiff_t,accT>::to(index));
+ accT value = THCNumerics<accT>::exp10(THCNumerics<accT>::add(start_, increment));
+ return ScalarConvert<accT,T>::to(value);
+ }
+
+ const accT start_, step_;
+};
+
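
For intuition, a small host-side sketch (not part of the patch) of the per-index arithmetic these functors perform; the start, end, and n values are illustrative, with logspace taking 10 to the power of the corresponding linspace value.

    #include <cmath>
    #include <cstdio>

    int main() {
      const double start = 1.0, end = 3.0;
      const long n = 5;
      const double step = (end - start) / (n - 1);
      for (long i = 0; i < n; ++i) {
        double lin = start + step * i;        // what LinspaceOp computes per index
        double lgs = std::pow(10.0, lin);     // what LogspaceOp computes per index
        std::printf("linspace: %g  logspace: %g\n", lin, lgs);
      }
      return 0;
    }
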
#include "generic/THCTensorMath.cu"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index 19ae679..b888672 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -43,6 +43,12 @@
#include "generic/THCTensorSort.h"
#include "THCGenerateAllTypes.h"
+#include "generic/THCTensorMode.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorTopK.h"
+#include "THCGenerateAllTypes.h"
+
THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self);
THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self);
diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu
index 7e6af9b..aaee332 100644
--- a/lib/THC/THCTensorMath2.cu
+++ b/lib/THC/THCTensorMath2.cu
@@ -16,7 +16,7 @@ struct TensorATan2Op {
void THCudaTensor_atan2(THCState *state, THCudaTensor *self_, THCudaTensor *tx, THCudaTensor *ty)
{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, tx, ty));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, self_, tx, ty));
THArgCheck(THCudaTensor_nElement(state, tx) ==
THCudaTensor_nElement(state, ty), 3, "sizes do not match");
THCudaTensor_resizeAs(state, self_, tx);
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 094cf0b..efefd76 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -244,11 +244,17 @@ template <typename T>
struct TensorRemainderOp {
TensorRemainderOp(T v) : val(v) {}
__device__ __forceinline__ void operator()(T* out, T* in) {
- *out = *in - val * (*in / val);
+ *out = *in % val;
+ if ((*out * val) < 0){
+ *out += val;
+ }
}
__device__ __forceinline__ void operator()(T* v) {
- *v = *v - val * (*v / val);
+ *v = *v % val;
+ if ((*v * val) < 0){
+ *v += val;
+ }
}
const T val;
@@ -399,5 +405,75 @@ struct TensorTriOp {
const long stride0, stride1, k;
};
+template <typename T>
+struct TensorLShiftConstantOp {
+ TensorLShiftConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in << val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v <<= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorRShiftConstantOp {
+ TensorRShiftConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in >> val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v >>= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorBitAndConstantOp {
+ TensorBitAndConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in & val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v &= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorBitOrConstantOp {
+ TensorBitOrConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in | val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v |= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorBitXorConstantOp {
+ TensorBitXorConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in ^ val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v ^= val;
+ }
+
+ const T val;
+};
+
#include "generic/THCTensorMathPairwise.cu"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
index de96cad..6ab010a 100644
--- a/lib/THC/THCTensorMathPointwise.cuh
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -415,11 +415,17 @@ struct TensorDivOp<half> {
template <typename T>
struct TensorCRemainderOp {
__device__ __forceinline__ void operator()(T* out, T* in) {
- *out = *in != 0 ? *out - *in * (*out / *in) : NAN;
+ *out = *out % *in;
+ if ((*out * *in)<0){
+ *out += *in;
+ }
}
__device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
- *out = *in2 != 0 ? *in1 - *in2 * (*in1 / *in2) : NAN;
+ *out = *in1 % *in2;
+ if ((*out * *in2)<0){
+ *out += *in2;
+ }
}
};
@@ -660,4 +666,123 @@ struct TensorAddCDivOp {
T val;
};
+template <typename T>
+struct TensorLShiftOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out <<= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 << *in2;
+ }
+};
+
+template <>
+struct TensorLShiftOp<float> {
+ __device__ __forceinline__ void
+ operator()(float* out, float* in) {
+ *out *= powf(2.0f, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(float* out, float* in1, float* in2) {
+ *out = *in1 * powf(2.0f, *in2);
+ }
+};
+
+template <>
+struct TensorLShiftOp<double> {
+ __device__ __forceinline__ void
+ operator()(double* out, double* in) {
+ *out *= pow(2.0, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(double* out, double* in1, double* in2) {
+ *out = *in1 * pow(2.0, *in2);
+ }
+};
+
+template <typename T>
+struct TensorRShiftOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out >>= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 >> *in2;
+ }
+};
+
+
+template <>
+struct TensorRShiftOp<float> {
+ __device__ __forceinline__ void
+ operator()(float* out, float* in) {
+ *out /= powf(2.0f, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(float* out, float* in1, float* in2) {
+ *out = *in1 / powf(2.0f, *in2);
+ }
+};
+
+template <>
+struct TensorRShiftOp<double> {
+ __device__ __forceinline__ void
+ operator()(double* out, double* in) {
+ *out /= pow(2.0, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(double* out, double* in1, double* in2) {
+ *out = *in1 / pow(2.0, *in2);
+ }
+};
+
+template <typename T>
+struct TensorBitAndOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out &= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 & *in2;
+ }
+};
+
+template <typename T>
+struct TensorBitOrOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out |= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 | *in2;
+ }
+};
+
+template <typename T>
+struct TensorBitXorOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out ^= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 ^ *in2;
+ }
+};
+
+
#endif // THC_TENSORMATH_POINTWISE_CUH
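
The float/double specializations of the shift ops above define shifting as scaling by powers of two rather than bit manipulation; a short host-side sketch of that convention (illustrative only, helper names lshiftf/rshiftf are not from the patch):

    #include <cmath>
    #include <cstdio>

    float lshiftf(float a, float k) { return a * powf(2.0f, k); }  // "a << k" for floats
    float rshiftf(float a, float k) { return a / powf(2.0f, k); }  // "a >> k" for floats

    int main() {
      std::printf("%g %g\n", lshiftf(3.0f, 2.0f), rshiftf(3.0f, 2.0f));  // 12 0.75
      return 0;
    }
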
diff --git a/lib/THC/THCTensorMathReduce.cu b/lib/THC/THCTensorMathReduce.cu
index 446daec..1025366 100644
--- a/lib/THC/THCTensorMathReduce.cu
+++ b/lib/THC/THCTensorMathReduce.cu
@@ -2,7 +2,7 @@
THC_API int
THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
- THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+ THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self));
unsigned char result;
if (!THC_reduceAll(state, self,
thrust::identity<unsigned char>(),
@@ -17,7 +17,7 @@ THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
THC_API int
THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self) {
- THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+ THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self));
unsigned char result;
if (!THC_reduceAll(state, self,
thrust::identity<unsigned char>(),
diff --git a/lib/THC/THCTensorMathReduce.cuh b/lib/THC/THCTensorMathReduce.cuh
index 5fefbab..5051fbe 100644
--- a/lib/THC/THCTensorMathReduce.cuh
+++ b/lib/THC/THCTensorMathReduce.cuh
@@ -469,8 +469,8 @@ kernelTransformReduceOuterDimIndex(K *tgt1,
for (unsigned col = 0; col < row_size; ++col) {
// +1 for Lua index
- acc = binary_op(thrust::make_pair<K, Index>(*src, col + TH_INDEX_BASE),
- acc);
+ acc = binary_op(acc,
+ thrust::make_pair<K, Index>(*src, col + TH_INDEX_BASE));
src += num_irows;
}
@@ -550,7 +550,7 @@ kernelTransformReduceInnermostDimIndex(K *tgt1,
K *src = src_ + row * row_size;
// Sequential reduction within a thread.
for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
- acc = binary_op(thrust::make_pair<K, Index>(src[col], col + TH_INDEX_BASE), acc);
+ acc = binary_op(acc, thrust::make_pair<K, Index>(src[col], col + TH_INDEX_BASE));
}
}
@@ -625,6 +625,7 @@ THC_reduceDimIndex(THCState *state,
TensorTypeIndex *tgt2_,
TensorTypeK *src,
long dimension,
+ int keepdim,
const thrust::pair<
typename TensorUtils<TensorTypeK>::DataType,
typename TensorUtils<TensorTypeIndex>::DataType>& init,
@@ -653,6 +654,10 @@ THC_reduceDimIndex(THCState *state,
TensorUtils<TensorTypeK>::free(state, src);
TensorUtils<TensorTypeK>::freeCopyTo(state, tgt1, tgt1_);
TensorUtils<TensorTypeIndex>::freeCopyTo(state, tgt2, tgt2_);
+ if (!keepdim) {
+ TensorUtils<TensorTypeK>::squeeze1d(state, tgt1_, tgt1_, dimension);
+ TensorUtils<TensorTypeIndex>::squeeze1d(state, tgt2_, tgt2_, dimension);
+ }
}
template <typename T, typename Index>
diff --git a/lib/THC/THCTensorMathScan.cu b/lib/THC/THCTensorMathScan.cu
index 3345e25..6f01bd2 100644
--- a/lib/THC/THCTensorMathScan.cu
+++ b/lib/THC/THCTensorMathScan.cu
@@ -6,6 +6,8 @@
#include "THCReduce.cuh"
#include "THCNumerics.cuh"
#include "THCTensorMathReduce.cuh"
+#include <thrust/scan.h>
+#include <thrust/execution_policy.h>
/* Perform an inclusive scan along an outer dimension of a tensor.
*
@@ -20,8 +22,8 @@
*/
template<typename T, class BinaryOp>
__global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_,
- unsigned num_orows, unsigned num_irows, unsigned row_size,
- T init, BinaryOp binary_op)
+ unsigned num_orows, unsigned num_irows, unsigned row_size,
+ T init, BinaryOp binary_op)
{
for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
@@ -52,8 +54,8 @@ __global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_,
*/
template<typename T, int num_threads_x, int num_threads_y, class BinaryFunction>
__global__ void THCTensor_kernel_scanInnermostDim(T *tgt_, T *src_,
- unsigned num_rows, unsigned row_size,
- T init, BinaryFunction binary_op)
+ unsigned num_rows, unsigned row_size,
+ T init, BinaryFunction binary_op)
{
__shared__ T sbuf[num_threads_y][2 * num_threads_x];
diff --git a/lib/THC/THCTensorMode.cu b/lib/THC/THCTensorMode.cu
new file mode 100644
index 0000000..aa6c628
--- /dev/null
+++ b/lib/THC/THCTensorMode.cu
@@ -0,0 +1,16 @@
+#include "THC.h"
+#include "THCThrustAllocator.cuh"
+#include "THCTensorTypeUtils.cuh"
+#include "THCReduceApplyUtils.cuh"
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/inner_product.h>
+#include <thrust/device_vector.h>
+#include <thrust/extrema.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+
+#include "THCTensorMode.cuh"
+
+#include "generic/THCTensorMode.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMode.cuh b/lib/THC/THCTensorMode.cuh
new file mode 100644
index 0000000..b67ac2a
--- /dev/null
+++ b/lib/THC/THCTensorMode.cuh
@@ -0,0 +1,282 @@
+#ifndef THC_TENSOR_MODE_CUH
+#define THC_TENSOR_MODE_CUH
+
+#include "THCNumerics.cuh"
+#include "THCSortUtils.cuh"
+#include "THCScanUtils.cuh"
+
+struct ThrustHalfLess
+{
+ __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) {
+ return THCNumerics<half>::lt(lhs, rhs);
+ }
+};
+
+struct ThrustHalfNotEqualTo
+{
+ __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) {
+ return THCNumerics<half>::ne(lhs, rhs);
+ }
+};
+
+struct ThrustHalfEqualTo
+{
+ __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) {
+ return THCNumerics<half>::eq(lhs, rhs);
+ }
+};
+
+struct ThrustHalfEqualToPredicate
+{
+ ThrustHalfEqualToPredicate(half val): val_(val) {}
+ __host__ __device__ inline bool operator()(half x) {
+ return THCNumerics<half>::eq(val_, x);
+ }
+
+ half val_;
+};
+
+template <typename T>
+struct BinaryAddOp {
+ __host__ __device__ inline T operator()(const T a, const T b) {
+ return THCNumerics<T>::add(a, b);
+ }
+};
+
+template <>
+struct BinaryAddOp<unsigned int> {
+ __host__ __device__ inline unsigned int operator()(const unsigned int a, const unsigned int b) {
+ return a + b;
+ }
+};
+
+// Used for a segmented reduction
+struct ModeUnsignedBoolPair {
+ unsigned int val;
+ bool flag;
+};
+
+// In the kernel below, we have a common pattern of reducing (unsigned int, unsigned int)
+// pairs of data
+struct ModeUnsignedPair {
+ unsigned int val;
+ unsigned int index;
+};
+
+template <typename T>
+struct MaxReduceOp {
+ __host__ __device__ inline T operator()(const T& a, const T& b) {
+ return b.val > a.val ? b : a;
+ }
+};
+
+template <typename T>
+struct MatchReduceOp {
+ __host__ __device__ inline T operator()(const T& a, const T& b) {
+ return b.flag ? b : a;
+ }
+};
+
+// The mode kernel has the following characteristics: It uses internal shared memory
+// buffers of Power2Size, which must be greater than the number of elements. Additionally,
+// there is one block for every slice to calculate the mode for, and in each block there
+// is one thread for every two elements.
+//
+// Both sorted and positions are assumed to be contiguous Tensors with the mode dimension
+// as the innermost dim, such that we can get the particular slice for a Tensor via its
+// linear block dimension * the slice size.
+template <typename T, unsigned int Power2Size>
+__global__ void computeMode(
+ T *input,
+ TensorInfo<T, unsigned int> values,
+ TensorInfo<long, unsigned int> indices,
+ long sliceSize)
+{
+ int tidx = threadIdx.x;
+ int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for
+
+ // First, we need to calculate the offset into the sorted Tensor that represents
+ // the start of the slice for this block to calculate the mode for. This offset
+ // is a combination of the gridIndices, and the number of elements in the slice.
+ unsigned int blockId = getLinearBlockId<unsigned int>();
+ unsigned int linearOffset = blockId * sliceSize;
+
+ // shmem is a dynamically sized buffer we will use throughout the kernel to
+ // handle computation efficiently. The size of this shmem must be
+ // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size)
+ //
+ // Initially, the buffer will be organized as follows:
+ //
+ // [smem (slice elements) | bmem (valid indices) | <scratch space>]
+ extern __shared__ char shmem[];
+
+ // smem represents a proportion of the shared memory buffer that is used to store
+ // the elements from the slice:
+ T *smem = reinterpret_cast<T *>(shmem);
+
+ // Each thread loads up to two elements from the Tensor into shared memory
+ if (tidx < sliceSize) {
+ smem[tidx] = input[linearOffset + tidx];
+ }
+ if (stidx < sliceSize) {
+ smem[stidx] = input[linearOffset + stidx];
+ }
+
+ // Next, we initialize a boolean region of the buffer, offset by the loaded element
+ // smem region
+ bool *bmem = reinterpret_cast<bool *>(&smem[Power2Size]);
+
+ // The first use of this region stores bmem[i] = i < sliceSize to mark the valid
+ // components in the smem buffer
+ bmem[tidx] = tidx < sliceSize;
+ bmem[stidx] = stidx < sliceSize;
+ __syncthreads(); // barrier for smem, bmem initialization
+
+ // First, sort the input slice in ascending order. smem contains the input
+ // elements, and bmem marks the valid indices
+ bitonicSortKeys<LTComp<T>, T, unsigned int, Power2Size>(smem, bmem, LTComp<T>());
+ __syncthreads(); // make no assumptions that the sort syncs at end
+
+ // The next step of our algorithm is performing a block-wide comparison of
+ // neighboring elements. In particular, given a sorted input slice A, we
+ // produce an output slice B, such that B[i] = 1 if A[i-1] != A[i], otherwise 0.
+ //
+ // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8]
+ // B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+ //
+ // In particular, we can think of B[i] true indicating the start of a sequence of
+ // equal values in the sorted list. Similarly, we will also store the negation of B,
+ // which we'll call C. In particular, we can think of C[i] = true iff A[i-1] == A[i]
+ // in our original sorted slice.
+ //
+ // C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+
+ // We overwrite bmem, and treat the rest of shared memory as a buffer of (index, flag) pairs
+ // where the index represents values from C, and the flag represents values from B.
+ //
+ // [smem (sorted slice) | ubpmem (index, flag pairs)]
+
+ struct ModeUnsignedBoolPair *ubpmem = reinterpret_cast<struct ModeUnsignedBoolPair *>(
+ &smem[Power2Size]);
+
+ if (tidx == 0) {
+ ubpmem[0].flag = true;
+ ubpmem[0].val = 0;
+ }
+
+ // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ...
+ ubpmem[tidx * 2 + 1].flag = THCNumerics<T>::ne(smem[tidx * 2], smem[tidx * 2 + 1]); // (0, 1), (1, 2), etc.
+ ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag;
+
+ // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ...
+ if (((tidx + 1) * 2) < Power2Size) {
+ ubpmem[(tidx + 1) * 2].flag = THCNumerics<T>::ne(smem[((tidx + 1) * 2) - 1], smem[(tidx + 1) * 2]);
+ ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag;
+ }
+ __syncthreads(); // barrier for ubpmem initialization
+
+ // Next, we perform a segmented prefix sum on the neighboring elements, where
+ // the presence of a one indicates the start of a segment. In this case B acts
+ // as the segment start flags, and C is the buffer to be summed:
+ //
+ // Input (C) = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+ // Flag (B) = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+ // Output (C) = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0]
+ //
+ // Afterwards, the (index) components of the ubpmem buffer contain the lengths of the
+ // segments (minus 1), i.e. the counts of each element in the original input.
+
+ inclusivePrefixScan<
+ struct ModeUnsignedBoolPair,
+ struct SegmentedScanOp<struct ModeUnsignedBoolPair, BinaryAddOp<unsigned int> >,
+ Power2Size>(
+ ubpmem,
+ SegmentedScanOp<struct ModeUnsignedBoolPair, BinaryAddOp<unsigned int> >(BinaryAddOp<unsigned int>()));
+ // assumes scan syncs at the end
+
+ // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e. we treat the
+ // boolean flag regions as integers). We initialize these to represent indices, and we'll call
+ // this buffer I
+ struct ModeUnsignedPair *uupmem = reinterpret_cast<struct ModeUnsignedPair *>(ubpmem);
+
+ // At this point, we need to find the maximum element in lengths buffer C.
+ // This element will represent the count (-1) of the mode. Because of the
+ // way we have set up the problem, the index where this mode occurs will
+ // also be the location of the mode value in the sorted array, e.g.
+ //
+ // smem = [0, 0, 1, 1, 1, 2]
+ // C = [0, 1, 0, 1, 2, 0]
+ // I = [0, 1, 2, 3, 4, 5]
+ // ^
+ // maximum value, also aligned with mode = 1
+ //
+ // We perform a block wide max-reduction of the C buffer, but we also need the
+ // indices to come along with it, so we utilize the uupmem construction.
+ //
+ // At the end we need to return the ModeUnsignedPair containing index = 4, val = 2,
+ // which represents the max
+
+ // In practice, we will make each thread locally reduce 2 values in its registers prior
+ // to the global block-wide reduction. Note that instead of tidx/stidx, we utilize tidx * 2,
+ // tidx * 2 + 1, so each thread deals with adjacent elements. This is because the reduce
+ // code below relies on thread elements to be adjacent.
+ struct ModeUnsignedPair uup[2];
+ uup[0].index = tidx * 2;
+ uup[0].val = ubpmem[tidx * 2].val;
+ uup[1].index = tidx * 2 + 1;
+ uup[1].val = ubpmem[tidx * 2 + 1].val;
+ __syncthreads();
+
+ struct ModeUnsignedPair max = {0, 0};
+
+ max = reduceBlockWithNThreadLocalReductions<struct ModeUnsignedPair, MaxReduceOp<struct ModeUnsignedPair>, 2>
+ (uupmem, uup, sliceSize, MaxReduceOp<struct ModeUnsignedPair>(), max);
+
+ // Store the mode in shared memory for use in finding the mode in the input slice
+ __shared__ T mode;
+
+ // Given the above constraints, the mode is the value at the reduced index in the
+ // original sorted element buffer
+ if (tidx == 0) {
+ mode = smem[max.index];
+ }
+ __syncthreads(); // broadcast mode
+
+ // Finally, we need to find an index of the mode in the input Tensor. The API does
+ // not constrain which index we pick, so it can be any of the indices that contain the mode.
+ // We will do a reduction to find the index. We go back to using the (index, flag) buffer
+ // arrangement. First, we mark indices that are equal to the mode, i.e. B[i] = true if
+ // input[i] == mode, and initialize C[i] to be the index
+ //
+ // Again we reduce 2 elements in the thread's registers prior to the block-wide reduction
+ struct ModeUnsignedBoolPair ubpp[2];
+ if (tidx * 2 < sliceSize) {
+ ubpp[0].flag = THCNumerics<T>::eq(input[linearOffset + (tidx * 2)], mode);
+ ubpp[0].val = tidx * 2;
+ }
+ if (tidx * 2 + 1 < sliceSize) {
+ ubpp[1].flag = THCNumerics<T>::eq(input[linearOffset + (tidx * 2 + 1)], mode);
+ ubpp[1].val = tidx * 2 + 1;
+ }
+
+ // Then we perform a similar reduction to the one above, except this time we update
+ // the element if the element at the base position is not equal to the mode and
+ // the element at the offset position is. At the end, C[0] will contain an index
+ // with the mode.
+ struct ModeUnsignedBoolPair match = {0, false};
+
+ match = reduceBlockWithNThreadLocalReductions<struct ModeUnsignedBoolPair, MatchReduceOp<struct ModeUnsignedBoolPair>, 2>
+ (ubpmem, ubpp, sliceSize, MatchReduceOp<struct ModeUnsignedBoolPair>(), match);
+
+ // Finally, we have the mode, and an index where it occurs. We use a single thread
+ // to place this in the appropriate output position
+ if (tidx == 0) {
+ long index = TH_INDEX_BASE + match.val;
+
+ unsigned int outputOffset = IndexToOffset<T, unsigned int, -1>::get(blockId, values);
+ values.data[outputOffset] = mode;
+ indices.data[outputOffset] = index;
+ }
+}
+
+#endif // THC_TENSOR_MODE_CUH
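
A compact CPU reference for the overall strategy the kernel comments describe (sort the slice, flag run boundaries where neighbours differ, take the value of the longest run). This is an illustrative sketch of the algorithm only, not the kernel's shared-memory data layout; the helper name modeOfSlice is hypothetical.

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    template <typename T>
    std::pair<T, int> modeOfSlice(std::vector<T> slice) {
      std::sort(slice.begin(), slice.end());
      T best = slice[0];
      int bestCount = 0, runLength = 0;
      for (size_t i = 0; i < slice.size(); ++i) {
        runLength = (i > 0 && slice[i] == slice[i - 1]) ? runLength + 1 : 1;
        if (runLength > bestCount) { bestCount = runLength; best = slice[i]; }
      }
      return std::make_pair(best, bestCount);
    }

    int main() {
      std::vector<int> slice = {0, 0, 1, 1, 1, 2};   // example from the comments above
      std::pair<int, int> m = modeOfSlice(slice);
      std::printf("mode=%d count=%d\n", m.first, m.second);  // mode=1 count=3
      return 0;
    }
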
diff --git a/lib/THC/THCTensorRandom.cuh b/lib/THC/THCTensorRandom.cuh
index d78409f..5afd8fe 100644
--- a/lib/THC/THCTensorRandom.cuh
+++ b/lib/THC/THCTensorRandom.cuh
@@ -97,44 +97,56 @@ __device__ int binarySearchForMultinomial(T* dist,
return start;
}
-template <typename T>
+template <typename T, typename AccT>
__global__ void
sampleMultinomialOnce(long* dest,
long distributions,
int categories,
T* sampled,
T* dist) {
- extern __shared__ __align__(sizeof(T)) unsigned char my_smem[];
+ extern __shared__ __align__(sizeof(AccT)) unsigned char my_smem[];
+ __shared__ bool found;
+
+ // Shared memory holds blockDim.x T values for the cumulative sum and
+ // blockDim.x AccT values for normalizing the probabilities.
T *smem = reinterpret_cast<T *>(my_smem);
+ AccT *asmem = reinterpret_cast<AccT *>(&my_smem[blockDim.x * sizeof(T)]);
+
+ AccT accZero = ScalarConvert<int, AccT>::to(0);
T zero = ScalarConvert<int, T>::to(0);
for (long curDist = blockIdx.x;
curDist < distributions; curDist += gridDim.x) {
// Each block handles one distribution
// First pass, find the total sum of the distribution
- T sum = zero;
+ AccT sum = accZero;
for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) {
- sum = THCNumerics<T>::add(sum, dist[curDist * categories + cat]);
+ sum = THCNumerics<AccT>::add(
+ sum,
+ ScalarConvert<T, AccT>::to(dist[curDist * categories + cat]));
}
// threadIdx.x == 0 has the sum value from this
- sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<T, T>(), zero);
+ sum = reduceBlock(asmem, blockDim.x, sum, ReduceAdd<AccT, AccT>(), accZero);
// Broadcast sum and sample value
if (threadIdx.x == 0) {
- smem[0] = sum;
- smem[1] = sampled[curDist];
+ // Make sure the sum of our distribution didn't overflow
+ assert(!isinf(sum));
+
+ asmem[0] = sum;
+ smem[0] = sampled[curDist];
}
__syncthreads();
- sum = smem[0];
- T sample = smem[1];
+ sum = asmem[0];
+ T sample = smem[0];
__syncthreads();
- if (THCNumerics<T>::eq(sum, zero) || THCNumerics<T>::eq(sample, zero)) {
+ if (THCNumerics<AccT>::eq(sum, accZero) || THCNumerics<T>::eq(sample, zero)) {
// Choose the first element
if (threadIdx.x == 0) {
- dest[curDist] = 1;
+ dest[curDist] = TH_INDEX_BASE;
}
continue;
@@ -142,16 +154,20 @@ sampleMultinomialOnce(long* dest,
int chunks = THCCeilDiv(categories, (int) blockDim.x);
T prevHighProb = zero;
+ found = false;
- for (int chunk = 0; chunk < chunks; ++chunk) {
+ for (int chunk = 0; chunk < chunks && !found; ++chunk) {
// All threads in bounds load a value
int cat = chunk * blockDim.x + threadIdx.x;
- T val =
- cat < categories ? THCNumerics<T>::div(dist[curDist * categories + cat], sum) :
- zero;
+ AccT val =
+ cat < categories ?
+ THCNumerics<AccT>::div(
+ ScalarConvert<T, AccT>::to(dist[curDist * categories + cat]),
+ sum) :
+ accZero;
- smem[threadIdx.x] = val;
+ smem[threadIdx.x] = ScalarConvert<AccT, T>::to(val);
__syncthreads();
// Perform an inclusive prefix sum of the shared memory contents
@@ -183,8 +199,8 @@ sampleMultinomialOnce(long* dest,
if (inBucket) {
// We're done; we have the sample
// Torch indices are 1-based
- // FIXME: broadcast exit flag?
dest[curDist] = cat + TH_INDEX_BASE;
+ found = true;
}
// Store the previous scan's high value for future use
@@ -192,6 +208,21 @@ sampleMultinomialOnce(long* dest,
__syncthreads();
}
+
+ if (threadIdx.x == 0 && !found) {
+ // This should address a rare bug where we don't select a valid index. This likely occurs when,
+ // due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1
+ // and our uniform sample is greater than this value. In this case we likely have uninitialized
+ // memory in dest[curDist]. So we loop through the distribution and pick the largest index
+ // where the distribution is non-zero. This is obviously inefficient, but given how rarely
+ // this occurs, it should not be an issue.
+ for (int cat = categories - 1; cat >= 0; --cat) {
+ if (THCNumerics<T>::gt(dist[curDist * categories + cat], zero)) {
+ dest[curDist] = cat + TH_INDEX_BASE;
+ break;
+ }
+ }
+ }
}
}
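
A serial sketch of the sampling logic including the new fallback path (assumptions: probabilities are non-negative, u is a uniform sample in [0, 1), and indices are 1-based as in Torch); illustrative only, and the helper name sampleMultinomialOnceRef is not from the patch:

    #include <cstdio>
    #include <vector>

    long sampleMultinomialOnceRef(const std::vector<float>& dist, float u) {
      float sum = 0.f;
      for (float p : dist) sum += p;
      if (sum == 0.f || u == 0.f) return 1;           // degenerate case: first bucket
      float cum = 0.f;
      for (size_t i = 0; i < dist.size(); ++i) {
        cum += dist[i] / sum;
        if (u < cum) return (long)i + 1;              // sample landed in this bucket
      }
      // Rounding kept the cumulative sum below u: fall back to the largest
      // non-zero category, mirroring the new !found path in the kernel.
      for (size_t i = dist.size(); i-- > 0; )
        if (dist[i] > 0.f) return (long)i + 1;
      return 1;
    }

    int main() {
      std::vector<float> dist = {0.1f, 0.2f, 0.7f};
      std::printf("%ld\n", sampleMultinomialOnceRef(dist, 0.25f));  // 2
      return 0;
    }
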
diff --git a/lib/THC/THCTensorScatterGather.cu b/lib/THC/THCTensorScatterGather.cu
index f3f3928..18c9dee 100644
--- a/lib/THC/THCTensorScatterGather.cu
+++ b/lib/THC/THCTensorScatterGather.cu
@@ -92,7 +92,8 @@ __global__ void THCudaTensor_gatherKernel(
tensor, &tensorOffset,
src, &srcOffset);
- IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+ long indexValue = index.data[indexOffset] - TH_INDEX_BASE;
+ assert(indexValue >= 0 && indexValue < src.sizes[dim]);
srcOffset += indexValue * src.strides[dim];
tensor.data[tensorOffset] = src.data[srcOffset];
@@ -118,7 +119,8 @@ __global__ void THCudaTensor_scatterKernel(
src, &srcOffset,
tensor, &tensorOffset);
- IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+ long indexValue = index.data[indexOffset] - TH_INDEX_BASE;
+ assert(indexValue >= 0 && indexValue < tensor.sizes[dim]);
tensorOffset += indexValue * tensor.strides[dim];
tensor.data[tensorOffset] = src.data[srcOffset];
@@ -142,7 +144,8 @@ __global__ void THCudaTensor_scatterFillKernel(
index, &indexOffset,
tensor, &tensorOffset);
- IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+ long indexValue = index.data[indexOffset] - TH_INDEX_BASE;
+ assert(indexValue >= 0 && indexValue < tensor.sizes[dim]);
tensorOffset += indexValue * tensor.strides[dim];
tensor.data[tensorOffset] = value;
diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu
index 589d3e9..f5f7338 100644
--- a/lib/THC/THCTensorSort.cu
+++ b/lib/THC/THCTensorSort.cu
@@ -1,21 +1,5 @@
#include "THCTensorSort.cuh"
-// Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks
-unsigned long nextHighestPowerOf2(unsigned long n) {
- n--;
- n |= n >> 1;
- n |= n >> 2;
- n |= n >> 4;
- n |= n >> 8;
- n |= n >> 16;
-#ifndef _MSC_VER
- n |= n >> 32;
-#endif
- n++;
-
- return n;
-}
-
void THCudaLongTensor_fillSliceWithIndex(THCState* state,
THCudaLongTensor* t,
int dim) {
diff --git a/lib/THC/THCTensorSort.cuh b/lib/THC/THCTensorSort.cuh
index 381f111..d47ee20 100644
--- a/lib/THC/THCTensorSort.cuh
+++ b/lib/THC/THCTensorSort.cuh
@@ -80,7 +80,6 @@ struct GlobalIndexToPerSliceIndex {
const long sliceSize;
};
-unsigned long nextHighestPowerOf2(unsigned long n);
void THCudaLongTensor_fillSliceWithIndex(THCState* state,
THCudaLongTensor* t,
int dim);
diff --git a/lib/THC/THCTensorTopK.cu b/lib/THC/THCTensorTopK.cu
index ec26178..325d560 100644
--- a/lib/THC/THCTensorTopK.cu
+++ b/lib/THC/THCTensorTopK.cu
@@ -5,531 +5,15 @@
#include "THCAsmUtils.cuh"
#include "THCScanUtils.cuh"
#include "THCTensorTypeUtils.cuh"
+#include "THCTensorMathReduce.cuh"
#include <algorithm> // for std::min
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif
-// Converts a float to an integer representation with the same
-// sorting; i.e., for floats f1, f2:
-// if f1 < f2 then convert(f1) < convert(f2)
-// We use this to enable radix selection of floating-point values.
-// This also gives a relative order for NaNs, but that's ok, as they
-// will all be adjacent
-struct FloatToSortedInt {
- inline __host__ __device__ FloatToSortedInt() {}
+#include "THCTensorTopK.cuh"
- inline __device__ unsigned int convert(float v) const {
- unsigned int x = __float_as_int(v);
- unsigned int mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
+#include "generic/THCTensorTopK.cu"
+#include "THCGenerateAllTypes.h"
- return (x ^ mask);
- }
-
- inline __device__ float deconvert(unsigned int v) const {
- unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
-
- return __int_as_float(v ^ mask);
- }
-};
-
-// This function counts the distribution of all input values in a
-// slice we are selecting by radix digit at `radixDigitPos`, but only
-// those that pass the filter `((v & desiredMask) == desired)`.
-// This produces and broadcasts the seen counts for a single block only.
-// `smem` must have at least `RadixSize` elements.
-template <typename DataType, typename BitDataType,
- typename IndexType, typename CountType,
- typename RadixConverter, int RadixSize, int RadixBits>
-__device__ void countRadixUsingMask(const RadixConverter& conv,
- CountType counts[RadixSize],
- CountType* smem,
- BitDataType desired,
- BitDataType desiredMask,
- int radixDigitPos,
- IndexType sliceSize,
- IndexType withinSliceStride,
- DataType* data) {
- // Clear out per-thread counts from a previous round
-#pragma unroll
- for (int i = 0; i < RadixSize; ++i) {
- counts[i] = 0;
- }
-
- if (threadIdx.x < RadixSize) {
- smem[threadIdx.x] = 0;
- }
- __syncthreads();
-
- // Scan over all the data. Upon a read, the warp will accumulate
- // counts per each digit in the radix using warp voting.
- for (IndexType i = threadIdx.x; i < sliceSize; i += blockDim.x) {
- BitDataType val = conv.convert(doLdg(&data[i * withinSliceStride]));
-
- bool hasVal = ((val & desiredMask) == desired);
- unsigned int digitInRadix = getBitfield(val, radixDigitPos, RadixBits);
-
-#pragma unroll
- for (unsigned int j = 0; j < RadixSize; ++j) {
- bool vote = hasVal && (digitInRadix == j);
- counts[j] += __popc(__ballot(vote));
- }
- }
-
- // Now, for each warp, sum values
- if (getLaneId() == 0) {
-#pragma unroll
- for (unsigned int i = 0; i < RadixSize; ++i) {
- atomicAdd(&smem[i], counts[i]);
- }
- }
-
- __syncthreads();
-
- // For each thread, read in the total counts
-#pragma unroll
- for (unsigned int i = 0; i < RadixSize; ++i) {
- counts[i] = smem[i];
- }
-
- __syncthreads();
-}
-
-// Over what radix we are selecting values
-#define RADIX_BITS 2 // digits are base-(2 ^ RADIX_BITS)
-#define RADIX_SIZE 4 // 2 ^ RADIX_BITS
-#define RADIX_MASK (RADIX_SIZE - 1)
-
-// This finds the unique value `v` that matches the pattern
-// ((v & desired) == desiredMask) in our sorted int format
-template <typename DataType, typename IndexType, typename RadixConverter>
-__device__ float findPattern(const RadixConverter& conv,
- DataType* smem,
- DataType* data,
- IndexType sliceSize,
- IndexType withinSliceStride,
- unsigned int desired,
- unsigned int desiredMask) {
- if (threadIdx.x < 32) {
- smem[threadIdx.x] = (DataType) 0;
- }
- __syncthreads();
-
- // All threads participate in the loop, in order to sync on the flag
- IndexType numIterations = THCRoundUp(sliceSize, (IndexType) blockDim.x);
- for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
- bool inRange = (i < sliceSize);
- DataType v = inRange ? doLdg(&data[i * withinSliceStride]) : (DataType) 0;
-
- if (inRange && ((conv.convert(v) & desiredMask) == desired)) {
- // There should not be conflicts if we are using findPattern,
- // since the result is unique
- smem[0] = (DataType) 1;
- smem[1] = v; // can't use val as the flag, since it could be 0
- }
-
- __syncthreads();
-
- DataType found = smem[0];
- DataType val = smem[1];
-
- __syncthreads();
-
- // Check to see if a thread found the value
- if (found != (DataType) 0) {
- // all threads return this value
- return val;
- }
- }
-
- // should not get here
- assert(false);
- return (DataType) 0;
-}
-
-// Returns the top-Kth element found in the data using radix selection
-template <typename DataType, typename BitDataType, typename IndexType,
- typename RadixConverter, bool Order>
-__device__ void radixSelect(const RadixConverter& conv,
- DataType* data,
- IndexType k,
- IndexType sliceSize,
- IndexType withinSliceStride,
- int* smem,
- DataType* topK) {
- // Per-thread buckets into which we accumulate digit counts in our
- // radix
- int counts[RADIX_SIZE];
-
- // We only consider elements x such that (x & desiredMask) == desired
- // Initially, we consider all elements of the array, so the above
- // statement is true regardless of input.
- unsigned int desired = 0;
- unsigned int desiredMask = 0;
-
- // We are looking for the top kToFind-th element when iterating over
- // digits; this count gets reduced by elimination when counting
- // successive digits
- int kToFind = k;
-
- // We start at the most significant digit in our radix, scanning
- // through to the least significant digit
-#pragma unroll
- for (int digitPos = sizeof(BitDataType) * 8 - RADIX_BITS;
- digitPos >= 0;
- digitPos -= RADIX_BITS) {
-
- // Count radix distribution for the current position and reduce
- // across all threads
- countRadixUsingMask<DataType, BitDataType,
- IndexType, int, RadixConverter,
- RADIX_SIZE, RADIX_BITS>(
- conv, counts, smem,
- desired, desiredMask, digitPos,
- sliceSize, withinSliceStride, data);
-
- // All threads participate in the comparisons below to know the
- // final result
-
-#define CHECK_RADIX(i) \
- int count = counts[i]; \
- \
- /* All threads have the same value in counts here, so all */ \
- /* threads will return from the function. */ \
- if (count == 1 && kToFind == 1) { \
- /* There is a unique answer. */ \
- desired = setBitfield(desired, i, digitPos, RADIX_BITS); \
- desiredMask = \
- setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
- \
- /* The answer is now the unique element v such that: */ \
- /* (v & desiredMask) == desired */ \
- /* However, we do not yet know what the actual element is. We */ \
- /* need to perform a search through the data to find the */ \
- /* element that matches this pattern. */ \
- *topK = findPattern<DataType, IndexType, RadixConverter>( \
- conv, (float*) smem, data, sliceSize, \
- withinSliceStride, desired, desiredMask); \
- return; \
- } \
- \
- if (count >= kToFind) { \
- desired = setBitfield(desired, i, digitPos, RADIX_BITS); \
- desiredMask = \
- setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
- \
- /* The top-Kth element v must now be one such that: */ \
- /* (v & desiredMask == desired) */ \
- /* but we haven't narrowed it down; we must check the next */ \
- /* least-significant digit */ \
- break; \
- } \
- \
- kToFind -= count \
-
- if (Order) {
- // Process in descending order
-#pragma unroll
- for (int i = RADIX_SIZE - 1; i >= 0; --i) {
- CHECK_RADIX(i);
- }
- } else {
- // Process in ascending order
-#pragma unroll
- for (int i = 0; i < RADIX_SIZE; ++i) {
- CHECK_RADIX(i);
- }
- }
-#undef CHECK_RADIX
- } // end digitPos for
-
- // There is no unique result, but there is a non-unique result
- // matching `desired` exactly
- *topK = conv.deconvert(desired);
-}
-
-template <typename IndexType, int Dim, bool Order>
-__global__ void gatherTopK(TensorInfo<float, IndexType> input,
- IndexType inputSliceSize,
- IndexType outputSliceSize, // aka `k`
-
- IndexType numInputSlices,
- IndexType inputWithinSliceStride,
-
- TensorInfo<float, IndexType> topK,
- IndexType numTopKSlices,
- IndexType topKWithinSliceStride,
-
- TensorInfo<long, IndexType> indices,
- IndexType indicesWithinSliceStride) {
- // Indices are limited to integer fp precision, so counts can fit in
- // int32, regardless of IndexType
- __shared__ int smem[32]; // one per each warp, up to warp limit
-
- IndexType slice = getLinearBlockId<IndexType>();
- if (slice >= numInputSlices) {
- return;
- }
-
- // Find the start offset for our slice
- IndexType sliceStartIndex =
- IndexToOffset<float, IndexType, Dim>::get(slice, input);
- IndexType topKSliceStartIndex =
- IndexToOffset<float, IndexType, Dim>::get(slice, topK);
- IndexType indicesSliceStartIndex =
- IndexToOffset<long, IndexType, Dim>::get(slice, indices);
-
- float* inputSliceStart = &input.data[sliceStartIndex];
- float* topKSliceStart = &topK.data[topKSliceStartIndex];
- long* indicesSliceStart = &indices.data[indicesSliceStartIndex];
-
- // Find the k-th highest element in our input
- float topKValue = -1.0f;
- radixSelect<float, unsigned int, IndexType, FloatToSortedInt, Order>(
- FloatToSortedInt(),
- inputSliceStart, outputSliceSize,
- inputSliceSize, inputWithinSliceStride,
- smem, &topKValue);
-
- // Every value that is strictly less/greater than `pattern`
- // (depending on sort dir) in sorted int format is in the top-K.
- // The top-K value itself might not be unique.
- //
- // Since there are a variable number of elements that we see that
- // are within the top-k, we don't know at what index to write out
- // the resulting values.
- // In order to get this, we perform an exclusive prefix sum of
- // `hasTopK`. This will return the resulting index into which we
- // need to write the result, if a thread has a result.
-
- // All threads need to participate in the loop and the prefix sum,
- // but not necessarily in the load; hence loop bounds being rounded
- // up to a multiple of the block dim.
- IndexType numIterations = THCRoundUp(inputSliceSize, (IndexType) blockDim.x);
- IndexType writeIndexStart = 0;
-
- for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
- bool inRange = (i < inputSliceSize);
- float v =
- inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : 0.0f;
- bool hasTopK;
- if (Order) {
- hasTopK = inRange && (v > topKValue);
- } else {
- hasTopK = inRange && (v < topKValue);
- }
-
- int index;
- int carry;
- exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
-
- if (hasTopK) {
- int writeIndex = writeIndexStart + index;
- assert(writeIndex < outputSliceSize);
-
- IndexType topKOffset = writeIndex * topKWithinSliceStride;
- IndexType indexOffset = writeIndex * indicesWithinSliceStride;
-
- topKSliceStart[topKOffset] = v;
- indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
- }
-
- writeIndexStart += carry;
- }
-
- // We need to fill in the rest with actual == top-K values.
- // The number that we need is outputSliceSize -
- // writeIndexStart. There might be more than that number available,
- // in which case we have to choose the first seen set. We do this
- // via a prefix sum to calculate indices for writing results.
- assert(outputSliceSize >= writeIndexStart);
- IndexType topKRemaining = (outputSliceSize - writeIndexStart);
-
- for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
- bool inRange = (i < inputSliceSize);
- float v =
- inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : 0.0f;
- bool hasTopK = inRange && (v == topKValue);
-
- int index;
- int carry;
- exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
-
- if (hasTopK && index < topKRemaining) {
- int writeIndex = writeIndexStart + index;
- assert(writeIndex < outputSliceSize);
-
- IndexType topKOffset = writeIndex * topKWithinSliceStride;
- IndexType indexOffset = writeIndex * indicesWithinSliceStride;
-
- topKSliceStart[topKOffset] = v;
- indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
- }
-
- if (carry >= topKRemaining) {
- break;
- }
-
- topKRemaining -= carry;
- writeIndexStart += carry;
- }
-}
-
-#undef RADIX_BITS
-#undef RADIX_SIZE
-#undef RADIX_MASK
-
-THC_API void THCudaTensor_topk(THCState* state,
- THCudaTensor *topK,
- THCudaLongTensor *indices,
- THCudaTensor *input,
- long k, int dim, int dir, int sorted) {
- THAssert(topK != NULL && indices != NULL && input != NULL);
- THAssert(THCudaTensor_checkGPU(state, 3, topK, indices, input));
- THCCheckTensorDims(state, topK, 2);
- long dims = THCudaLongTensor_nDimension(state, indices);
- THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
- THCCheckTensorDims(state, input, 2);
-
- int numDims = THCudaTensor_nDimension(state, input);
- THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range");
-
- long sliceSize = THCudaTensor_size(state, input, dim);
- THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension");
-
- // Build the output size, which is the dim being selected set to
- // size k
- THLongStorage* topKSize = THCudaTensor_newSizeOf(state, input);
- THLongStorage_set(topKSize, dim, k);
- THCudaTensor_resize(state, topK, topKSize, NULL);
- THCudaLongTensor_resize(state, indices, topKSize, NULL);
- THLongStorage_free(topKSize);
-
-#define RUN_K(INDEX_T, DIM, DIR) \
- gatherTopK<INDEX_T, DIM, DIR> \
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
- inputInfo, \
- sliceSize, \
- k, \
- inputSlices, \
- /* The actual dimension that the k-selection is running in */ \
- /* may have changed from collapseDims() */ \
- inputInfo.strides[collapseInputDim], \
- topKInfo, \
- topKSlices, \
- topKInfo.strides[collapseTopKDim], \
- indicesInfo, \
- indicesInfo.strides[collapseIndicesDim])
-
-#define RUN_DIR(INDEX_T, DIM) \
- if (dir) { \
- RUN_K(INDEX_T, DIM, true); \
- } else { \
- RUN_K(INDEX_T, DIM, false); \
- }
-
-#define RUN_DIM(INDEX_T) \
- if (allDims == 1) { \
- RUN_DIR(INDEX_T, 1); \
- } else if (allDims == 2) { \
- RUN_DIR(INDEX_T, 2); \
- } else if (allDims == 3) { \
- RUN_DIR(INDEX_T, 3); \
- } else { \
- RUN_DIR(INDEX_T, -1); \
- }
-
-#define RUN_T(INDEX_T) \
- TensorInfo<float, INDEX_T> inputInfo = \
- getTensorInfo<THCudaTensor, INDEX_T>(state, input); \
- TensorInfo<float, INDEX_T> topKInfo = \
- getTensorInfo<THCudaTensor, INDEX_T>(state, topK); \
- TensorInfo<long, INDEX_T> indicesInfo = \
- getTensorInfo<THCudaLongTensor, INDEX_T>(state, indices); \
- \
- /* We use these structures solely to find the offset to */ \
- /* each slice we are operating on */ \
- inputInfo.sizes[dim] = 1; \
- topKInfo.sizes[dim] = 1; \
- indicesInfo.sizes[dim] = 1; \
- \
- /* Collapse all other dims */ \
- int collapseInputDim = inputInfo.collapseDims(dim); \
- int collapseTopKDim = topKInfo.collapseDims(dim); \
- int collapseIndicesDim = indicesInfo.collapseDims(dim); \
- \
- long inputSlices = 1; \
- long topKSlices = 1; \
- for (int i = 0; i < numDims; ++i) { \
- inputSlices *= inputInfo.sizes[i]; \
- topKSlices *= topKInfo.sizes[i]; \
- } \
- \
- dim3 grid; \
- if (!THC_getGridFromTiles(inputSlices, grid)) { \
- THError("Slice to sort is too large"); \
- } \
- \
- dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \
- \
- /* This is used as a template parameter to calculate indices. */ \
- /* We only specialize it if all collapsed dim sizes are the */ \
- /* same; otherwise, we use -1 which is the specialization */ \
- /* parameter for arbitrary dimensions */ \
- int allDims = inputInfo.dims; \
- if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \
- allDims = -1; \
- } \
- \
- RUN_DIM(INDEX_T);
-
- // Based on required index size, run the algorithm with the
- // appropriate index type
- if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input) &&
- TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, topK) &&
- TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
- RUN_T(unsigned int);
- } else {
- RUN_T(unsigned long);
- }
-#undef RUN_T
-#undef RUN_DIM
-#undef RUN_DIR
-#undef RUN_K
-
- // Sort the results if the user wants them sorted, since our
- // selection routine does not ensure sorting
- if (sorted) {
- // FIXME: the k/v inplace sort along slice only works for size <=
- // 2048 at the moment
- if (sliceSize <= 2048) {
- // This avoids any memory allocations and performs all sorting
- // work inplace along the slice
- THCudaTensor_sortKeyValueInplace(state, topK, indices, dim, dir);
- } else {
- // Depend upon the backup sort that returns indices, which we
- // can use in conjunction with gather to produce the original
- // indices.
- // This is not the most efficient implementation, especially since
- // there are memory allocations performed here. If the user desires
- // greater performance, they should torch.gather() the results
- // themselves using the reported indices, providing previously
- // allocated tensors to receive the results.
- THCudaTensor* sortedTopK = THCudaTensor_new(state);
- THCudaLongTensor* sortedIndices = THCudaLongTensor_new(state);
- THCudaTensor_sort(state, sortedTopK, sortedIndices, topK, dim, dir);
-
- THCudaLongTensor* sortedTopKIndices = THCudaLongTensor_new(state);
-
- THCudaLongTensor_resizeAs(state, sortedTopKIndices, indices);
- THCudaLongTensor_gather(state, sortedTopKIndices, indices, dim, sortedIndices);
-
- THCudaTensor_freeCopyTo(state, sortedTopK, topK);
- THCudaLongTensor_freeCopyTo(state, sortedTopKIndices, indices);
- THCudaLongTensor_free(state, sortedIndices);
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
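The gatherTopK kernel (moved into THCTensorTopK.cuh below) assigns write slots with an exclusive prefix sum over the per-thread hasTopK flags: a thread's scan value is its slot within the pass, and the pass total (the "carry") advances the running write offset. A sequential C++ sketch of that bookkeeping, with illustrative names and blockDim as an ordinary parameter:

    #include <cstdio>
    #include <vector>

    // Sequential model of gatherTopK's write indexing: each pass covers
    // `blockDim` elements, an exclusive scan over the hasTopK flags gives
    // each surviving element its slot, and the pass total advances
    // writeIndexStart for the next pass.
    void compactGreaterThan(const std::vector<float>& in, float threshold,
                            int blockDim, std::vector<float>& out) {
        int writeIndexStart = 0;
        for (size_t base = 0; base < in.size(); base += blockDim) {
            int running = 0;                             // exclusive scan accumulator
            for (int t = 0; t < blockDim && base + t < in.size(); ++t) {
                bool hasTopK = in[base + t] > threshold;
                int index = running;                     // this element's slot in the pass
                if (hasTopK) {
                    out[writeIndexStart + index] = in[base + t];
                    ++running;
                }
            }
            writeIndexStart += running;                  // "carry" from this pass
        }
    }

    int main() {
        std::vector<float> in = {0.2f, 5.0f, 3.5f, 0.1f, 9.0f, 7.0f};
        std::vector<float> out(4);
        compactGreaterThan(in, 1.0f, /*blockDim=*/4, out);   // keeps 5, 3.5, 9, 7 in order
        for (float v : out) std::printf("%g ", v);
        std::printf("\n");
    }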
diff --git a/lib/THC/THCTensorTopK.cuh b/lib/THC/THCTensorTopK.cuh
new file mode 100644
index 0000000..7269e99
--- /dev/null
+++ b/lib/THC/THCTensorTopK.cuh
@@ -0,0 +1,485 @@
+#ifndef THC_TENSOR_TOPK_CUH
+#define THC_TENSOR_TOPK_CUH
+
+template <typename T>
+struct TopKTypeConfig {};
+
+template <>
+struct TopKTypeConfig<float> {
+ typedef unsigned int RadixType;
+
+ // Converts a float to an integer representation with the same
+ // sorting; i.e., for floats f1, f2:
+ // if f1 < f2 then convert(f1) < convert(f2)
+ // We use this to enable radix selection of floating-point values.
+ // This also gives a relative order for NaNs, but that's ok, as they
+ // will all be adjacent
+ static inline __device__ RadixType convert(float v) {
+ RadixType x = __float_as_int(v);
+ RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
+
+ return (x ^ mask);
+ }
+
+ static inline __device__ float deconvert(RadixType v) {
+ RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
+
+ return __int_as_float(v ^ mask);
+ }
+};
+
+template <>
+struct TopKTypeConfig<unsigned char> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(unsigned char v) {
+ return v;
+ }
+
+ static inline __device__ unsigned char deconvert(RadixType v) {
+ return v;
+ }
+};
+
+template <>
+struct TopKTypeConfig<char> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(char v) {
+ return 128u + v;
+ }
+
+ static inline __device__ char deconvert(RadixType v) {
+ return v - 128;
+ }
+};
+
+template <>
+struct TopKTypeConfig<short> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(short v) {
+ assert(sizeof(short) == 2);
+ return 32768u + v;
+ }
+
+ static inline __device__ short deconvert(RadixType v) {
+ return v - 32768;
+ }
+};
+
+template <>
+struct TopKTypeConfig<int> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(int v) {
+ assert(sizeof(int) == 4);
+ return 2147483648u + v;
+ }
+
+ static inline __device__ int deconvert(RadixType v) {
+ return v - 2147483648u;
+ }
+};
+
+template <>
+struct TopKTypeConfig<long> {
+ typedef unsigned long long int RadixType;
+
+ static inline __device__ RadixType convert(long v) {
+ assert(sizeof(long) == 8);
+ return 9223372036854775808ull + v;
+ }
+
+ static inline __device__ long deconvert(RadixType v) {
+ return v - 9223372036854775808ull;
+ }
+};
+
+template <>
+struct TopKTypeConfig<double> {
+ typedef unsigned long long int RadixType;
+
+ static inline __device__ RadixType convert(double v) {
+ RadixType x = __double_as_longlong(v);
+ RadixType mask = -((x >> 63)) | 0x8000000000000000;
+ return (x ^ mask);
+ }
+
+ static inline __device__ double deconvert(RadixType v) {
+ RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
+ return __longlong_as_double(v ^ mask);
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TopKTypeConfig<half> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(half v) {
+#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
+ RadixType x = __half_as_ushort(v);
+ RadixType mask = -((x >> 15)) | 0x8000;
+ return (x ^ mask);
+#else
+ assert(false);
+ return 0u;
+#endif
+ }
+
+ static inline __device__ half deconvert(RadixType v) {
+#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
+ RadixType mask = ((v >> 15) - 1) | 0x8000;
+ return __ushort_as_half(v ^ mask);
+#else
+ assert(false);
+ return ScalarConvert<int, half>::to(0);
+#endif
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+// This function counts the distribution of all input values in a
+// slice we are selecting by radix digit at `radixDigitPos`, but only
+// those that pass the filter `((v & desiredMask) == desired)`.
+// This produces and broadcasts the seen counts for a single block only.
+// `smem` must have at least `RadixSize` elements.
+template <typename DataType, typename BitDataType,
+ typename IndexType, typename CountType,
+ int RadixSize, int RadixBits>
+__device__ void countRadixUsingMask(CountType counts[RadixSize],
+ CountType* smem,
+ BitDataType desired,
+ BitDataType desiredMask,
+ int radixDigitPos,
+ IndexType sliceSize,
+ IndexType withinSliceStride,
+ DataType* data) {
+ // Clear out per-thread counts from a previous round
+#pragma unroll
+ for (int i = 0; i < RadixSize; ++i) {
+ counts[i] = 0;
+ }
+
+ if (threadIdx.x < RadixSize) {
+ smem[threadIdx.x] = 0;
+ }
+ __syncthreads();
+
+ // Scan over all the data. Upon a read, the warp will accumulate
+ // counts per each digit in the radix using warp voting.
+ for (IndexType i = threadIdx.x; i < sliceSize; i += blockDim.x) {
+ BitDataType val = TopKTypeConfig<DataType>::convert(doLdg(&data[i * withinSliceStride]));
+
+ bool hasVal = ((val & desiredMask) == desired);
+ BitDataType digitInRadix = Bitfield<BitDataType>::getBitfield(val, radixDigitPos, RadixBits);
+
+#pragma unroll
+ for (unsigned int j = 0; j < RadixSize; ++j) {
+ bool vote = hasVal && (digitInRadix == j);
+ counts[j] += __popc(__ballot(vote));
+ }
+ }
+
+ // Now, for each warp, sum values
+ if (getLaneId() == 0) {
+#pragma unroll
+ for (unsigned int i = 0; i < RadixSize; ++i) {
+ atomicAdd(&smem[i], counts[i]);
+ }
+ }
+
+ __syncthreads();
+
+ // For each thread, read in the total counts
+#pragma unroll
+ for (unsigned int i = 0; i < RadixSize; ++i) {
+ counts[i] = smem[i];
+ }
+
+ __syncthreads();
+}
+
+// Over what radix we are selecting values
+#define RADIX_BITS 2 // digits are base-(2 ^ RADIX_BITS)
+#define RADIX_SIZE 4 // 2 ^ RADIX_BITS
+#define RADIX_MASK (RADIX_SIZE - 1)
+
+// This finds the unique value `v` that matches the pattern
+// ((v & desiredMask) == desired) in our sorted int format
+template <typename DataType, typename BitDataType, typename IndexType>
+__device__ DataType findPattern(DataType* smem,
+ DataType* data,
+ IndexType sliceSize,
+ IndexType withinSliceStride,
+ BitDataType desired,
+ BitDataType desiredMask) {
+ if (threadIdx.x < 32) {
+ smem[threadIdx.x] = ScalarConvert<int, DataType>::to(0);
+ }
+ __syncthreads();
+
+ // All threads participate in the loop, in order to sync on the flag
+ IndexType numIterations = THCRoundUp(sliceSize, (IndexType) blockDim.x);
+ for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+ bool inRange = (i < sliceSize);
+ DataType v = inRange ? doLdg(&data[i * withinSliceStride]) : ScalarConvert<int, DataType>::to(0);
+
+ if (inRange && ((TopKTypeConfig<DataType>::convert(v) & desiredMask) == desired)) {
+ // There should not be conflicts if we are using findPattern,
+ // since the result is unique
+ smem[0] = ScalarConvert<int, DataType>::to(1);
+ smem[1] = v; // can't use val as the flag, since it could be 0
+ }
+
+ __syncthreads();
+
+ DataType found = smem[0];
+ DataType val = smem[1];
+
+ __syncthreads();
+
+ // Check to see if a thread found the value
+ if (THCNumerics<DataType>::ne(found, ScalarConvert<int, DataType>::to(0))) {
+ // all threads return this value
+ return val;
+ }
+ }
+
+ // should not get here
+ assert(false);
+ return ScalarConvert<int, DataType>::to(0);
+}
+
+// Returns the top-Kth element found in the data using radix selection
+template <typename DataType, typename BitDataType, typename IndexType, bool Order>
+__device__ void radixSelect(DataType* data,
+ IndexType k,
+ IndexType sliceSize,
+ IndexType withinSliceStride,
+ int* smem,
+ DataType* topK) {
+ // Per-thread buckets into which we accumulate digit counts in our
+ // radix
+ int counts[RADIX_SIZE];
+
+ // We only consider elements x such that (x & desiredMask) == desired
+ // Initially, we consider all elements of the array, so the above
+ // statement is true regardless of input.
+ BitDataType desired = 0;
+ BitDataType desiredMask = 0;
+
+ // We are looking for the top kToFind-th element when iterating over
+ // digits; this count gets reduced by elimination when counting
+ // successive digits
+ int kToFind = k;
+
+ // We start at the most significant digit in our radix, scanning
+ // through to the least significant digit
+#pragma unroll
+ for (int digitPos = sizeof(DataType) * 8 - RADIX_BITS;
+ digitPos >= 0;
+ digitPos -= RADIX_BITS) {
+
+ // Count radix distribution for the current position and reduce
+ // across all threads
+ countRadixUsingMask<DataType, BitDataType,
+ IndexType, int,
+ RADIX_SIZE, RADIX_BITS>(
+ counts, smem,
+ desired, desiredMask, digitPos,
+ sliceSize, withinSliceStride, data);
+
+ // All threads participate in the comparisons below to know the
+ // final result
+
+
+#define CHECK_RADIX(i) \
+ int count = counts[i]; \
+ \
+ /* All threads have the same value in counts here, so all */ \
+ /* threads will return from the function. */ \
+ if (count == 1 && kToFind == 1) { \
+ /* There is a unique answer. */ \
+ desired = Bitfield<BitDataType>::setBitfield(desired, i, digitPos, RADIX_BITS); \
+ desiredMask = \
+ Bitfield<BitDataType>::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
+ \
+ /* The answer is now the unique element v such that: */ \
+ /* (v & desiredMask) == desired */ \
+ /* However, we do not yet know what the actual element is. We */ \
+ /* need to perform a search through the data to find the */ \
+ /* element that matches this pattern. */ \
+ *topK = findPattern<DataType, BitDataType, IndexType>( \
+ (DataType*) smem, data, sliceSize, \
+ withinSliceStride, desired, desiredMask); \
+ return; \
+ } \
+ \
+ if (count >= kToFind) { \
+ desired = Bitfield<BitDataType>::setBitfield(desired, i, digitPos, RADIX_BITS); \
+ desiredMask = \
+ Bitfield<BitDataType>::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
+ \
+ /* The top-Kth element v must now be one such that: */ \
+ /* (v & desiredMask == desired) */ \
+ /* but we haven't narrowed it down; we must check the next */ \
+ /* least-significant digit */ \
+ break; \
+ } \
+ \
+ kToFind -= count \
+
+ if (Order) {
+ // Process in descending order
+#pragma unroll
+ for (int i = RADIX_SIZE - 1; i >= 0; --i) {
+ CHECK_RADIX(i);
+ }
+ } else {
+ // Process in ascending order
+#pragma unroll
+ for (int i = 0; i < RADIX_SIZE; ++i) {
+ CHECK_RADIX(i);
+ }
+ }
+#undef CHECK_RADIX
+ } // end digitPos for
+
+ // There is no unique result, but there is a non-unique result
+ // matching `desired` exactly
+ *topK = TopKTypeConfig<DataType>::deconvert(desired);
+}
+
+template <typename T, typename IndexType, int Dim, bool Order>
+__global__ void gatherTopK(TensorInfo<T, IndexType> input,
+ IndexType inputSliceSize,
+ IndexType outputSliceSize, // aka `k`
+
+ IndexType numInputSlices,
+ IndexType inputWithinSliceStride,
+
+ TensorInfo<T, IndexType> topK,
+ IndexType numTopKSlices,
+ IndexType topKWithinSliceStride,
+
+ TensorInfo<long, IndexType> indices,
+ IndexType indicesWithinSliceStride) {
+ // Indices are limited to integer fp precision, so counts can fit in
+ // int32, regardless of IndexType
+ __shared__ int smem[32]; // one per each warp, up to warp limit
+
+ IndexType slice = getLinearBlockId<IndexType>();
+ if (slice >= numInputSlices) {
+ return;
+ }
+
+ // Find the start offset for our slice
+ IndexType sliceStartIndex =
+ IndexToOffset<T, IndexType, Dim>::get(slice, input);
+ IndexType topKSliceStartIndex =
+ IndexToOffset<T, IndexType, Dim>::get(slice, topK);
+ IndexType indicesSliceStartIndex =
+ IndexToOffset<long, IndexType, Dim>::get(slice, indices);
+
+ T* inputSliceStart = &input.data[sliceStartIndex];
+ T* topKSliceStart = &topK.data[topKSliceStartIndex];
+ long* indicesSliceStart = &indices.data[indicesSliceStartIndex];
+
+ // Find the k-th highest element in our input
+ T topKValue = ScalarConvert<int, T>::to(0);
+ radixSelect<T, typename TopKTypeConfig<T>::RadixType, IndexType, Order>(
+ inputSliceStart, outputSliceSize,
+ inputSliceSize, inputWithinSliceStride,
+ smem, &topKValue);
+
+ // Every value that is strictly less/greater than `pattern`
+ // (depending on sort dir) in sorted int format is in the top-K.
+ // The top-K value itself might not be unique.
+ //
+ // Since there are a variable number of elements that we see that
+ // are within the top-k, we don't know at what index to write out
+ // the resulting values.
+ // In order to get this, we perform an exclusive prefix sum of
+ // `hasTopK`. This will return the resulting index into which we
+ // need to write the result, if a thread has a result.
+
+ // All threads need to participate in the loop and the prefix sum,
+ // but not necessarily in the load; hence loop bounds being rounded
+ // up to a multiple of the block dim.
+ IndexType numIterations = THCRoundUp(inputSliceSize, (IndexType) blockDim.x);
+ IndexType writeIndexStart = 0;
+
+ for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+ bool inRange = (i < inputSliceSize);
+ T v =
+ inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
+ bool hasTopK;
+ if (Order) {
+ hasTopK = inRange && (THCNumerics<T>::gt(v, topKValue));
+ } else {
+ hasTopK = inRange && (THCNumerics<T>::lt(v, topKValue));
+ }
+
+ int index;
+ int carry;
+ exclusiveBinaryPrefixScan<int, true>(smem, hasTopK, &index, &carry, AddOp<int>());
+
+ if (hasTopK) {
+ int writeIndex = writeIndexStart + index;
+ assert(writeIndex < outputSliceSize);
+
+ IndexType topKOffset = writeIndex * topKWithinSliceStride;
+ IndexType indexOffset = writeIndex * indicesWithinSliceStride;
+
+ topKSliceStart[topKOffset] = v;
+ indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
+ }
+
+ writeIndexStart += carry;
+ }
+
+ // We need to fill in the rest with actual == top-K values.
+ // The number that we need is outputSliceSize -
+ // writeIndexStart. There might be more than that number available,
+ // in which case we have to choose the first seen set. We do this
+ // via a prefix sum to calculate indices for writing results.
+ assert(outputSliceSize >= writeIndexStart);
+ IndexType topKRemaining = (outputSliceSize - writeIndexStart);
+
+ for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+ bool inRange = (i < inputSliceSize);
+ T v =
+ inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
+ bool hasTopK = inRange && (THCNumerics<T>::eq(v, topKValue));
+
+ int index;
+ int carry;
+ exclusiveBinaryPrefixScan<int, true>(smem, hasTopK, &index, &carry, AddOp<int>());
+
+ if (hasTopK && index < topKRemaining) {
+ int writeIndex = writeIndexStart + index;
+ assert(writeIndex < outputSliceSize);
+
+ IndexType topKOffset = writeIndex * topKWithinSliceStride;
+ IndexType indexOffset = writeIndex * indicesWithinSliceStride;
+
+ topKSliceStart[topKOffset] = v;
+ indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
+ }
+
+ if (carry >= topKRemaining) {
+ break;
+ }
+
+ topKRemaining -= carry;
+ writeIndexStart += carry;
+ }
+}
+
+#undef RADIX_BITS
+#undef RADIX_SIZE
+#undef RADIX_MASK
+
+#endif // THC_TENSOR_TOPK_CUH
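A host-side C++ check of the TopKTypeConfig<float> mapping above, assuming IEEE-754 floats: flipping the sign bit for positives and all bits for negatives makes unsigned comparison agree with float comparison. memcpy stands in for the device-only __float_as_int / __int_as_float intrinsics; nothing here is cutorch API.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint32_t convertFloat(float v) {
        uint32_t x;
        std::memcpy(&x, &v, sizeof(x));                   // host stand-in for __float_as_int
        uint32_t mask = (x & 0x80000000u) ? 0xffffffffu : 0x80000000u;
        return x ^ mask;
    }

    static float deconvertFloat(uint32_t v) {
        uint32_t mask = (v & 0x80000000u) ? 0x80000000u : 0xffffffffu;
        uint32_t x = v ^ mask;
        float f;
        std::memcpy(&f, &x, sizeof(f));                   // host stand-in for __int_as_float
        return f;
    }

    int main() {
        float a = -2.5f, b = 0.5f, c = 3.75f;
        // Order is preserved: a < b < c implies convert(a) < convert(b) < convert(c)
        assert(convertFloat(a) < convertFloat(b) && convertFloat(b) < convertFloat(c));
        // The mapping is invertible
        assert(deconvertFloat(convertFloat(c)) == c);
    }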
diff --git a/lib/THC/THCTensorTopK.h b/lib/THC/THCTensorTopK.h
deleted file mode 100644
index 711c047..0000000
--- a/lib/THC/THCTensorTopK.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef TH_CUDA_TENSOR_TOPK_INC
-#define TH_CUDA_TENSOR_TOPK_INC
-
-#include "THCTensor.h"
-
-/* Returns the set of all kth smallest (or largest) elements, depending */
-/* on `dir` */
-THC_API void THCudaTensor_topk(THCState* state,
- THCudaTensor* topK,
- THCudaLongTensor* indices,
- THCudaTensor* input,
- long k, int dim, int dir, int sorted);
-
-#endif
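For readers following radixSelect in the new THCTensorTopK.cuh, here is the same idea as a single-threaded C++ sketch over already-converted unsigned keys: fix RADIX_BITS bits at a time from the most significant end, keeping only the bucket that still contains the k-th largest element. It omits the kernel's warp voting and findPattern early-out, so treat it as an illustration rather than the device code.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Returns the k-th largest (k >= 1, k <= data.size()) element of `data`,
    // narrowing 2 radix bits per round.
    uint32_t radixSelectLargest(const std::vector<uint32_t>& data, int k) {
        const int RADIX_BITS = 2, RADIX_SIZE = 4;
        uint32_t desired = 0, desiredMask = 0;
        int kToFind = k;

        for (int digitPos = 32 - RADIX_BITS; digitPos >= 0; digitPos -= RADIX_BITS) {
            int counts[RADIX_SIZE] = {0, 0, 0, 0};
            for (uint32_t v : data) {
                if ((v & desiredMask) == desired) {          // still a candidate
                    ++counts[(v >> digitPos) & (RADIX_SIZE - 1)];
                }
            }
            // Largest-first: walk the digit buckets from high to low.
            for (int d = RADIX_SIZE - 1; d >= 0; --d) {
                if (counts[d] >= kToFind) {                  // the answer lives in bucket d
                    desired |= (uint32_t)d << digitPos;
                    desiredMask |= (uint32_t)(RADIX_SIZE - 1) << digitPos;
                    break;
                }
                kToFind -= counts[d];                        // skip this whole bucket
            }
        }
        return desired;                                      // every bit has been fixed
    }

    int main() {
        std::vector<uint32_t> data = {7, 42, 19, 42, 3, 100};
        std::printf("%u\n", radixSelectLargest(data, 2));    // prints 42 (2nd largest)
    }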
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
index bdcbcbe..e4c1c34 100644
--- a/lib/THC/THCTensorTypeUtils.cu
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -73,6 +73,14 @@ TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state, \
TENSOR_TYPE##_resizeAs(state, dst, src); \
} \
\
+void \
+TensorUtils<TENSOR_TYPE>::squeeze1d(THCState *state, \
+ TENSOR_TYPE *dst, \
+ TENSOR_TYPE *src, \
+ int dimension) { \
+ TENSOR_TYPE##_squeeze1d(state, dst, src, dimension); \
+} \
+ \
DATA_TYPE* \
TensorUtils<TENSOR_TYPE>::getData(THCState* state, \
TENSOR_TYPE* t) { \
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 273606e..37edb76 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -49,6 +49,8 @@ struct TensorUtils {
THLongStorage* strides); \
static void resizeAs(THCState* state, TENSOR_TYPE* dst, \
TENSOR_TYPE* src); \
+ static void squeeze1d(THCState *state, TENSOR_TYPE *dst, \
+ TENSOR_TYPE *src, int dimension); \
static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \
static ptrdiff_t getNumElements(THCState* state, TENSOR_TYPE* t); \
static long getSize(THCState* state, TENSOR_TYPE* t, int dim); \
diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c
index 1770535..c227032 100644
--- a/lib/THC/generic/THCTensor.c
+++ b/lib/THC/generic/THCTensor.c
@@ -65,7 +65,6 @@ void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag)
/**** creation methods ****/
static void THCTensor_(rawInit)(THCState *state, THCTensor *self);
-static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
/* Empty init */
@@ -81,13 +80,13 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor)
{
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
- THCTensor_(rawSet)(state,
- self,
- tensor->storage,
- tensor->storageOffset,
- tensor->nDimension,
- tensor->size,
- tensor->stride);
+ THCTensor_(setStorageNd)(state,
+ self,
+ tensor->storage,
+ tensor->storageOffset,
+ tensor->nDimension,
+ tensor->size,
+ tensor->stride);
return self;
}
@@ -99,13 +98,13 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd
THArgCheck(size->size == stride->size, 4, "inconsistent size");
THCTensor_(rawInit)(state, self);
- THCTensor_(rawSet)(state,
- self,
- storage,
- storageOffset,
- (size ? size->size : (stride ? stride->size : 0)),
- (size ? size->data : NULL),
- (stride ? stride->data : NULL));
+ THCTensor_(setStorageNd)(state,
+ self,
+ storage,
+ storageOffset,
+ (size ? size->size : (stride ? stride->size : 0)),
+ (size ? size->data : NULL),
+ (stride ? stride->data : NULL));
return self;
}
@@ -141,7 +140,7 @@ THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, pt
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
- THCTensor_(rawSet)(state, self, storage, storageOffset, 4, size, stride);
+ THCTensor_(setStorageNd)(state, self, storage, storageOffset, 4, size, stride);
return self;
}
@@ -172,7 +171,7 @@ THCTensor *THCTensor_(newWithSize4d)(THCState *state, long size0, long size1, lo
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
- THCTensor_(rawResize)(state, self, 4, size, NULL);
+ THCTensor_(resizeNd)(state, self, 4, size, NULL);
return self;
}
@@ -224,6 +223,17 @@ THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimensi
return self;
}
+THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size)
+{
+ THArgCheck(THCTensor_(isContiguous)(state, tensor), 2, "input is not contiguous");
+ ptrdiff_t numel = THCTensor_(nElement)(state, tensor);
+ THCTensor *self = THCTensor_(new)(state);
+ THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel);
+ THCTensor_(setStorage)(state, self, tensor->storage, tensor->storageOffset, inferred_size, NULL);
+ THLongStorage_free(inferred_size);
+ return self;
+}
+
/* Resize */
void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride)
{
@@ -231,7 +241,7 @@ void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, T
if(stride)
THArgCheck(stride->size == size->size, 3, "invalid stride");
- THCTensor_(rawResize)(state, self, size->size, size->data, (stride ? stride->data : NULL));
+ THCTensor_(resizeNd)(state, self, size->size, size->data, (stride ? stride->data : NULL));
}
void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src)
@@ -252,7 +262,7 @@ void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src)
}
if(!isSame)
- THCTensor_(rawResize)(state, self, src->nDimension, src->size, NULL);
+ THCTensor_(resizeNd)(state, self, src->nDimension, src->size, NULL);
}
void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, long size0)
@@ -274,26 +284,48 @@ void THCTensor_(resize4d)(THCState *state, THCTensor *self, long size0, long siz
{
long size[4] = {size0, size1, size2, size3};
- THCTensor_(rawResize)(state, self, 4, size, NULL);
+ THCTensor_(resizeNd)(state, self, 4, size, NULL);
}
void THCTensor_(resize5d)(THCState *state, THCTensor *self, long size0, long size1, long size2, long size3, long size4)
{
long size[5] = {size0, size1, size2, size3, size4};
- THCTensor_(rawResize)(state, self, 5, size, NULL);
+ THCTensor_(resizeNd)(state, self, 5, size, NULL);
+}
+
+THCTensor* THCTensor_(newExpand)(THCState *state, THCTensor *tensor, THLongStorage *sizes) {
+ THArgCheck(THLongStorage_size(sizes) >= THCTensor_(nDimension)(state, tensor), 1, "the number of sizes provided \
+ must be greater than or equal to the number of dimensions in the tensor");
+ THArgCheck(THCTensor_(nDimension)(state, tensor) > 0, 0, "can't expand an empty tensor");
+
+ long *expandedSizes;
+ long *expandedStrides;
+ THLongStorage_calculateExpandGeometry(tensor->size,
+ tensor->stride,
+ THCTensor_(nDimension)(state, tensor),
+ sizes,
+ &expandedSizes,
+ &expandedStrides);
+
+ THCTensor *result = THCTensor_(new)(state);
+ THCTensor_(setStorageNd)(state, result, THCTensor_(storage)(state, tensor), THCTensor_(storageOffset)(state, tensor), THLongStorage_size(sizes), expandedSizes, expandedStrides);
+ THFree(expandedSizes);
+ THFree(expandedStrides);
+
+ return result;
}
void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src)
{
if(self != src)
- THCTensor_(rawSet)(state,
- self,
- src->storage,
- src->storageOffset,
- src->nDimension,
- src->size,
- src->stride);
+ THCTensor_(setStorageNd)(state,
+ self,
+ src->storage,
+ src->storageOffset,
+ src->nDimension,
+ src->size,
+ src->stride);
}
void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
@@ -301,13 +333,13 @@ void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storag
if(size_ && stride_)
THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes");
- THCTensor_(rawSet)(state,
- self,
- storage_,
- storageOffset_,
- (size_ ? size_->size : (stride_ ? stride_->size : 0)),
- (size_ ? size_->data : NULL),
- (stride_ ? stride_->data : NULL));
+ THCTensor_(setStorageNd)(state,
+ self,
+ storage_,
+ storageOffset_,
+ (size_ ? size_->size : (stride_ ? stride_->size : 0)),
+ (size_ ? size_->data : NULL),
+ (stride_ ? stride_->data : NULL));
}
void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
@@ -353,7 +385,7 @@ void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *stor
long size[4] = {size0_, size1_, size2_, size3_};
long stride[4] = {stride0_, stride1_, stride2_, stride3_};
- THCTensor_(rawSet)(state, self, storage_, storageOffset_, 4, size, stride);
+ THCTensor_(setStorageNd)(state, self, storage_, storageOffset_, 4, size, stride);
}
@@ -517,6 +549,33 @@ void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int
}
}
+void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension)
+{
+ int d;
+
+ if(!src)
+ src = self;
+
+ THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 3, "dimension out of range");
+ THArgCheck(src->nDimension > 0, 3, "cannot unsqueeze empty tensor");
+
+ THCTensor_(set)(state, self, src);
+
+ self->size = (long*)THRealloc(self->size, sizeof(long)*(self->nDimension+1));
+ self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1));
+ self->nDimension++;
+ for (d = self->nDimension-1; d > dimension; d--) {
+ self->size[d] = self->size[d-1];
+ self->stride[d] = self->stride[d-1];
+ }
+ if (dimension+1 < self->nDimension) {
+ self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1];
+ } else {
+ self->stride[dimension] = 1;
+ }
+ self->size[dimension] = 1;
+}
+
int THCTensor_(isContiguous)(THCState *state, const THCTensor *self)
{
long z = 1;
@@ -637,7 +696,7 @@ static void THCTensor_(rawInit)(THCState *state, THCTensor *self)
self->flag = TH_TENSOR_REFCOUNTED;
}
-static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
+void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
{
/* storage */
if(self->storage != storage)
@@ -660,10 +719,10 @@ static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *sto
self->storageOffset = storageOffset;
/* size and stride */
- THCTensor_(rawResize)(state, self, nDimension, size, stride);
+ THCTensor_(resizeNd)(state, self, nDimension, size, stride);
}
-void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride)
+void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride)
{
int d;
int nDimension_;
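THCTensor_(unsqueeze1d) above inserts a size-1 dimension without touching storage: the inserted dimension gets stride size[d+1] * stride[d+1] (or 1 when appended at the end), so every element keeps the same linear offset. A small C++ sketch of that size/stride bookkeeping, with hypothetical names:

    #include <cstdio>
    #include <vector>

    // Insert a size-1 dimension at `dim` into a (sizes, strides) pair the
    // same way unsqueeze1d does; the underlying data layout is unchanged.
    void unsqueeze(std::vector<long>& sizes, std::vector<long>& strides, int dim) {
        long newStride = (dim < (int)sizes.size())
                         ? sizes[dim] * strides[dim]     // stride of the slot pushed right
                         : 1;                            // appending a trailing dimension
        sizes.insert(sizes.begin() + dim, 1);
        strides.insert(strides.begin() + dim, newStride);
    }

    int main() {
        // A contiguous 3x4 tensor: sizes {3, 4}, strides {4, 1}.
        std::vector<long> sizes = {3, 4}, strides = {4, 1};
        unsqueeze(sizes, strides, 1);                    // -> sizes {3, 1, 4}, strides {4, 4, 1}
        for (size_t i = 0; i < sizes.size(); ++i)
            std::printf("size %ld stride %ld\n", sizes[i], strides[i]);
    }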
diff --git a/lib/THC/generic/THCTensor.cu b/lib/THC/generic/THCTensor.cu
index 29561ca..8f13a7d 100644
--- a/lib/THC/generic/THCTensor.cu
+++ b/lib/THC/generic/THCTensor.cu
@@ -4,7 +4,7 @@
cudaTextureObject_t THCTensor_(getTextureObject)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
cudaTextureObject_t texObj;
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h
index 9cd4807..9c5d5be 100644
--- a/lib/THC/generic/THCTensor.h
+++ b/lib/THC/generic/THCTensor.h
@@ -66,6 +66,8 @@ THC_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int
THC_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, long firstIndex_, long size_);
THC_API THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_);
THC_API THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, long size_, long step_);
+THC_API THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size);
+THC_API THCTensor *THCTensor_(newExpand)(THCState *state, THCTensor *tensor, THLongStorage *size);
THC_API void THCTensor_(resize)(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride);
THC_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src);
@@ -74,10 +76,11 @@ THC_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, long size0
THC_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_);
THC_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_);
THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_);
-THC_API void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride);
+THC_API void THCTensor_(resizeNd)(THCState *state, THCTensor *tensor, int nDimension, long *size, long *stride);
THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+THC_API void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
long size0_, long stride0_);
THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
@@ -100,6 +103,7 @@ THC_API void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src
THC_API void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
+THC_API void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self);
THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src);
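newView (declared above, implemented via THLongStorage_newInferSize) accepts a shape with at most one -1 entry and infers that dimension from the element count. A standalone C++ sketch of that inference, assuming the usual -1 convention; the helper name is illustrative and this is not the TH implementation:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Resolve a view shape: at most one dimension may be -1, and it is
    // inferred so that the product of sizes equals `numel`.
    std::vector<long> inferSize(std::vector<long> sizes, long numel) {
        long known = 1;
        int inferred = -1;
        for (size_t i = 0; i < sizes.size(); ++i) {
            if (sizes[i] == -1) {
                assert(inferred == -1 && "only one dimension may be -1");
                inferred = (int)i;
            } else {
                known *= sizes[i];
            }
        }
        if (inferred >= 0) {
            assert(known != 0 && numel % known == 0);
            sizes[inferred] = numel / known;
        } else {
            assert(known == numel);
        }
        return sizes;
    }

    int main() {
        std::vector<long> s = inferSize({2, -1, 4}, 24);     // -> {2, 3, 4}
        std::printf("%ld %ld %ld\n", s[0], s[1], s[2]);
    }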
diff --git a/lib/THC/generic/THCTensorCopy.c b/lib/THC/generic/THCTensorCopy.c
index 874a71e..b5122b8 100644
--- a/lib/THC/generic/THCTensorCopy.c
+++ b/lib/THC/generic/THCTensorCopy.c
@@ -118,12 +118,12 @@ void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor
THCudaCheck(cudaSetDevice(tensorDevice));
}
- cudaStream_t stream = THCState_getCurrentStream(state);
+ THCStream *stream = THCState_getStream(state);
THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self),
THTensor_(data)(src),
THTensor_(nElement)(src) * sizeof(real),
cudaMemcpyHostToDevice,
- stream));
+ stream->stream));
THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
@@ -149,12 +149,12 @@ void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor
THCudaCheck(cudaSetDevice(tensorDevice));
}
- cudaStream_t stream = THCState_getCurrentStream(state);
+ THCStream *stream = THCState_getStream(state);
THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self),
THCTensor_(data)(state, src),
THCTensor_(nElement)(state, src) * sizeof(real),
cudaMemcpyDeviceToHost,
- stream));
+ stream->stream));
THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
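The copyAsync paths above now pass the full THCStream to THCCachingHostAllocator_recordEvent, so the allocator can record an event on the stream the copy runs on and knows when the pinned source buffer may be reused. A bare CUDA runtime sketch of that pattern, using plain cudaStream_t / cudaEvent_t rather than the cutorch wrappers:

    #include <cstdio>
    #include <cuda_runtime.h>

    #define CHECK(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
        std::printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

    int main() {
        const size_t bytes = 1 << 20;
        float *host, *device;
        CHECK(cudaMallocHost((void**)&host, bytes));    // pinned host buffer
        CHECK(cudaMalloc((void**)&device, bytes));

        cudaStream_t stream;
        cudaEvent_t done;
        CHECK(cudaStreamCreate(&stream));
        CHECK(cudaEventCreateWithFlags(&done, cudaEventDisableTiming));

        // Async H2D copy, then record an event on the same stream; the host
        // buffer may only be reused once that event has completed.
        CHECK(cudaMemcpyAsync(device, host, bytes, cudaMemcpyHostToDevice, stream));
        CHECK(cudaEventRecord(done, stream));

        CHECK(cudaEventSynchronize(done));              // or poll cudaEventQuery(done)
        std::printf("copy finished; host buffer reusable\n");

        CHECK(cudaEventDestroy(done));
        CHECK(cudaStreamDestroy(stream));
        CHECK(cudaFree(device));
        CHECK(cudaFreeHost(host));
        return 0;
    }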
diff --git a/lib/THC/generic/THCTensorIndex.cu b/lib/THC/generic/THCTensorIndex.cu
index ce4c790..e388cdd 100644
--- a/lib/THC/generic/THCTensorIndex.cu
+++ b/lib/THC/generic/THCTensorIndex.cu
@@ -4,7 +4,7 @@
void THCTensor_(indexCopy_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
THCudaLongTensor_copyLong(state, indices_, indices);
@@ -16,8 +16,8 @@ void THCTensor_(indexCopy_long)(THCState *state, THCTensor *dst, int dim, THLong
void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
@@ -132,7 +132,7 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT
void THCTensor_(indexAdd_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
THCudaLongTensor_copyLong(state, indices_, indices);
@@ -144,8 +144,8 @@ void THCTensor_(indexAdd_long)(THCState *state, THCTensor *dst, int dim, THLongT
void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
@@ -260,7 +260,7 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe
void THCTensor_(indexFill_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, real val)
{
- THAssert(THCTensor_(checkGPU)(state, 1, dst));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, dst));
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
THCudaLongTensor_copyLong(state, indices_, indices);
@@ -272,8 +272,8 @@ void THCTensor_(indexFill_long)(THCState *state, THCTensor *dst, int dim, THLong
void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, real val)
{
- THAssert(THCTensor_(checkGPU)(state, 1, dst));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, dst));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
dims = THCudaLongTensor_nDimension(state, indices);
@@ -374,7 +374,7 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT
void THCTensor_(indexSelect_long)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THLongTensor *indices)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
THArgCheck(indices->nDimension == 1, 3, "Index is supposed to be a vector");
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
@@ -387,7 +387,7 @@ void THCTensor_(indexSelect_long)(THCState *state, THCTensor *dst, THCTensor *sr
void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THCudaLongTensor *indices)
{
- THAssert(THCTensor_(checkGPU)(state, 3, dst, src, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, dst, src, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
diff --git a/lib/THC/generic/THCTensorMasked.cu b/lib/THC/generic/THCTensorMasked.cu
index 05d9360..c15edd4 100644
--- a/lib/THC/generic/THCTensorMasked.cu
+++ b/lib/THC/generic/THCTensorMasked.cu
@@ -7,7 +7,7 @@ THC_API void
THCTensor_(maskedFill)(THCState* state,
THCTensor *tensor, THCudaByteTensor *mask, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, mask));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask));
THArgCheck(THCTensor_(nElement)(state, tensor) ==
THCudaByteTensor_nElement(state, mask),
2, "sizes do not match");
@@ -24,7 +24,7 @@ THC_API void
THCTensor_(maskedFillByte)(THCState* state,
THCTensor *tensor, THByteTensor *mask, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor));
THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
THLongStorage_free(maskSizes);
@@ -37,7 +37,7 @@ THC_API void
THCTensor_(maskedCopy)(THCState* state,
THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask);
ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor);
ptrdiff_t srcSize = THCTensor_(nElement)(state, src);
@@ -104,7 +104,7 @@ THCTensor_(maskedCopy)(THCState* state,
THC_API void
THCTensor_(maskedCopyByte)(THCState* state,
THCTensor *tensor, THByteTensor *mask, THCTensor *src) {
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
THLongStorage_free(maskSizes);
@@ -116,7 +116,7 @@ THCTensor_(maskedCopyByte)(THCState* state,
THC_API void
THCTensor_(maskedSelect)(THCState* state,
THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) {
- THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
THArgCheck(THCudaByteTensor_nElement(state, mask) ==
THCTensor_(nElement)(state, src),
2, "sizes do not match");
@@ -181,7 +181,7 @@ THC_API void
THCTensor_(maskedSelectByte)(THCState* state,
THCTensor *tensor, THCTensor *src, THByteTensor *mask)
{
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
THLongStorage_free(maskSizes);
diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu
index 46746f7..0eed5a9 100644
--- a/lib/THC/generic/THCTensorMath.cu
+++ b/lib/THC/generic/THCTensorMath.cu
@@ -5,7 +5,7 @@
THC_API void
THCTensor_(fill)(THCState* state, THCTensor *self_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
if (!THC_pointwiseApply1(
state, self_, TensorFillOp<real>(value))) {
@@ -18,7 +18,7 @@ THCTensor_(fill)(THCState* state, THCTensor *self_, real value)
THC_API void
THCTensor_(zero)(THCState *state, THCTensor *self_)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
if (THCTensor_(isContiguous)(state, self_)) {
THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_),
0,
@@ -38,7 +38,7 @@ THCTensor_(zero)(THCState *state, THCTensor *self_)
THC_API void
THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(zero)(state, r_);
}
@@ -46,7 +46,7 @@ THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size)
THC_API void
THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(fill)(state, r_, ScalarConvert<int, real>::to(1));
}
@@ -54,7 +54,7 @@ THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size)
THC_API void
THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 2, r_, t));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, t));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(copy)(state, r_, t);
}
@@ -87,8 +87,8 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// loop below will overwrite the value
int maxDim = dimension + 1;
- // ldimension is the actual dimension we cat along (minus 1, for 0-based indexing)
- int ldimension = dimension;
+ // cat_dimension is the actual dimension we cat along
+ int cat_dimension = dimension;
for (i = 0; i < numInputs; i++)
{
@@ -100,13 +100,13 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// In the event that the user specified -1 as the concat dimension, then
// we want to pick the maxDim as dimension to cat along (and thus maxDim - 1 as the
// value due to 0-based indexing). If the maxDim is 0 (i.e. we are catting all
- // empty tensors), then we set ldimension to be 0
+ // empty tensors), then we set cat_dimension to be 0
if (dimension + TH_INDEX_BASE == -1) {
- ldimension = maxDim ? (maxDim - 1) : 0;
+ cat_dimension = maxDim ? (maxDim - 1) : 0;
}
THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
- THArgCheck(ldimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
+ THArgCheck(cat_dimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
size = THLongStorage_newWithSize(maxDim);
for(i = 0; i < maxDim; i++)
@@ -115,7 +115,7 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
long dimSize = i < THCTensor_(nDimension)(state, inputs[0])
? THCTensor_(size)(state, inputs[0], i)
: THMin(THCTensor_(nDimension)(state, inputs[0]), 1);
- if (i == ldimension)
+ if (i == cat_dimension)
{
for (j = 1; j < numInputs; j++)
{
@@ -175,23 +175,9 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
real *data = THCTensor_(data)(state, result);
// Kernel Parameter
- CatArrInputTensor<real, unsigned int> stackInputs[CAT_ARRAY_BATCH_SIZE];
- CatArrInputTensor<real, unsigned int> *d_inputs;
-
- // Attempt to re-use stream's scratch space for the input metadata
- bool usedScratch = false;
size_t tensorMetadataSize = sizeof(CatArrInputTensor<real, unsigned int>) * CAT_ARRAY_BATCH_SIZE;
- if (THCState_getCurrentDeviceScratchSpaceSize(state) > tensorMetadataSize) {
- void* space = THCState_getCurrentDeviceScratchSpace(state);
- if (space) {
- d_inputs = (CatArrInputTensor<real, unsigned int> *) space;
- usedScratch = true;
- }
- }
- if (!usedScratch) {
- // Fallback to allocating GPU memory
- THCudaCheck(THCudaMalloc(state, (void**) &d_inputs, tensorMetadataSize));
- }
+ CatArrInputTensor<real, unsigned int> *d_inputs;
+ THCudaCheck(THCudaMalloc(state, (void**) &d_inputs, tensorMetadataSize));
OutputTensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> param;
@@ -201,17 +187,21 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
param.outputStride[i] = THCTensor_(stride)(state, result, i);
}
+ THCStream* stream = THCState_getStream(state);
+
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
- CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock>>>(data, d_inputs, param, ldimension, param.outputStride[dimension]);
+ CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock, 0, stream->stream>>>(data, d_inputs, param, cat_dimension, param.outputStride[cat_dimension]);
// Now we loop
offset = 0;
for (i = 0; i < numInputs; i += CAT_ARRAY_BATCH_SIZE) {
+ // Re-allocate stackInputs every iteration to avoid read-after-write hazard
+ CatArrInputTensor<real, unsigned int>* stackInputs = (CatArrInputTensor<real, unsigned int>*) THCudaHostAlloc(state, tensorMetadataSize);
cohortMax = 0;
for (j = 0; j < CAT_ARRAY_BATCH_SIZE && (i+j) < numInputs; ++j) {
- long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[i+j])
- ? THCTensor_(size)(state, inputs[i+j], ldimension)
+ long dimSize = cat_dimension < THCTensor_(nDimension)(state, inputs[i+j])
+ ? THCTensor_(size)(state, inputs[i+j], cat_dimension)
: 1;
stackInputs[j].input = THCTensor_(data)(state, inputs[i+j]);
@@ -223,7 +213,14 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// update offset
offset += dimSize;
}
- cudaMemcpy(d_inputs, stackInputs, j * sizeof(CatArrInputTensor<real, unsigned int>), cudaMemcpyHostToDevice);
+ THCudaCheck(cudaMemcpyAsync(
+ d_inputs,
+ stackInputs,
+ j * sizeof(CatArrInputTensor<real, unsigned int>),
+ cudaMemcpyHostToDevice,
+ stream->stream));
+ THCudaHostRecord(state, stackInputs);
+ THCudaHostFree(state, stackInputs);
// Next, let's consider how we set our kernel launch parameters.
// We borrow from THCApply, which the kernel's internal indexing
@@ -256,9 +253,7 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
}
THCudaCheck(cudaGetLastError());
}
- if (!usedScratch) {
- THCudaCheck(THCudaFree(state, (void *)d_inputs));
- }
+ THCudaCheck(THCudaFree(state, d_inputs));
#undef HANDLE_CASE
} else {
offset = 0;
@@ -267,12 +262,12 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// No reason to copy when input is empty
if (!THCTensor_(nDimension)(state, inputs[j])) continue;
- long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[j])
- ? THCTensor_(size)(state, inputs[j], ldimension)
+ long dimSize = cat_dimension < THCTensor_(nDimension)(state, inputs[j])
+ ? THCTensor_(size)(state, inputs[j], cat_dimension)
: 1;
THCTensor *nt = THCTensor_(newWithTensor)(state, result);
- THCTensor_(narrow)(state, nt, NULL, ldimension, offset, dimSize);
+ THCTensor_(narrow)(state, nt, NULL, cat_dimension, offset, dimSize);
THCTensor_(copy)(state, nt, inputs[j]);
THCTensor_(free)(state, nt);
offset += dimSize;
@@ -283,8 +278,8 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor,
THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self ));
- THAssert(THCudaLongTensor_checkGPU(state, 1, tensor));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self ));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, tensor));
using namespace thrust::placeholders;
@@ -348,7 +343,7 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor,
}
void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, long k){
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
int nDimension = THCTensor_(nDimension)(state, src_);
THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector");
if (nDimension == 2) {
@@ -382,7 +377,7 @@ void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, long k
}
accreal THCTensor_(trace)(THCState *state, THCTensor *src_) {
- THAssert(THCTensor_(checkGPU)(state, 1, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, src_));
THArgCheck((src_->nDimension == 2), 1, "expected a matrix");
THCTensor *diag = THCTensor_(new)(state);
THCTensor_(diag)(state, diag, src_, 0);
@@ -391,4 +386,67 @@ accreal THCTensor_(trace)(THCState *state, THCTensor *src_) {
return trace;
}
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, long n) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
+ THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points");
+ if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n);
+ if (n == 1) THCTensor_(fill)(state, r_, a);
+ else {
+ THCTensor *r = THCTensor_(isContiguous)(state, r_)
+ ? r_ // if r_ is contiguous we can work on it directly
+ : THCTensor_(newContiguous)(state, r_);
+ real step = THCNumerics<real>::div(THCNumerics<real>::sub(b, a),
+ ScalarConvert<long,real>::to(n - 1));
+ LinspaceOp<real> linspace_method(a, step);
+ thrust::device_ptr<real> data_(THCTensor_(data)(state, r));
+ thrust::tabulate(data_, data_ + n, linspace_method);
+ if (!THCTensor_(isContiguous)(state, r_)) { // We need to move data back to r_
+ THCTensor_(freeCopyTo)(state, r, r_);
+ }
+ }
+ THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, long n) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
+ THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points");
+ if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n);
+ if (n == 1) THCTensor_(fill)(state, r_, THCNumerics<real>::exp10(a));
+ else {
+ THCTensor *r = THCTensor_(isContiguous)(state, r_)
+ ? r_
+ : THCTensor_(newContiguous)(state, r_);
+ real step = THCNumerics<real>::div(THCNumerics<real>::sub(b, a),
+ ScalarConvert<long,real>::to(n - 1));
+ LogspaceOp<real> logspace_method(a, step);
+ thrust::device_ptr<real> data_(THCTensor_(data)(state, r));
+ thrust::tabulate(data_, data_ + n, logspace_method);
+ if (!THCTensor_(isContiguous)(state, r_)) {
+ THCTensor_(freeCopyTo)(state, r, r_);
+ }
+ }
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
+
+void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
+ THArgCheck(step > 0 || step < 0, 3, "step must be nonzero");
+ THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin))
+ , 2, "upper and lower bounds inconsistent with step sign");
+ ptrdiff_t size = (ptrdiff_t) (((xmax - xmin) / step) + 1);
+ if (THCTensor_(nElement)(state, r_) != size) THCTensor_(resize1d)(state, r_, size);
+ THCTensor *r = THCTensor_(isContiguous)(state, r_)
+ ? r_
+ : THCTensor_(newContiguous)(state, r_);
+ LinspaceOp<real,accreal> linspace_method(xmin, step);
+ thrust::device_ptr<real> data_(THCTensor_(data)(state, r));
+ thrust::tabulate(data_, data_ + size, linspace_method);
+ if (!THCTensor_(isContiguous)(state, r_)) THCTensor_(freeCopyTo)(state, r, r_);
+ THCudaCheck(cudaGetLastError());
+}
+
#endif
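
The rewritten catArray path above stages the per-batch input metadata through pinned host memory, issues the host-to-device copy asynchronously on the current THC stream, and hands the pinned buffer back to the caching host allocator before launching the batched copy kernel. Below is a minimal standalone sketch of that staging pattern using the plain CUDA runtime instead of THC's allocators; the struct, kernel, and buffer names are illustrative, not taken from the patch.

#include <cuda_runtime.h>
#include <cstdio>

// Stand-in for CatArrInputTensor: just enough metadata for a flat copy.
struct InputMeta {
  const float* input;
  unsigned int offset;      // where this input starts in the output
  unsigned int nElements;   // number of elements to copy
};

__global__ void copyFromMeta(float* out, const InputMeta* metas, int n) {
  // One block per input tensor; a real cat kernel would also apply the
  // output stride along the concat dimension.
  int t = blockIdx.x;
  if (t >= n) return;
  for (unsigned int i = threadIdx.x; i < metas[t].nElements; i += blockDim.x)
    out[metas[t].offset + i] = metas[t].input[i];
}

int main() {
  const int kBatch = 2, kElems = 4;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float *d_in0, *d_in1, *d_out;
  cudaMalloc((void**)&d_in0, kElems * sizeof(float));
  cudaMalloc((void**)&d_in1, kElems * sizeof(float));
  cudaMalloc((void**)&d_out, 2 * kElems * sizeof(float));
  cudaMemset(d_in0, 0, kElems * sizeof(float));
  cudaMemset(d_in1, 0, kElems * sizeof(float));

  // Device-side metadata buffer (what THCudaMalloc provides above).
  InputMeta* d_meta;
  cudaMalloc((void**)&d_meta, kBatch * sizeof(InputMeta));

  // Pinned host staging buffer (what THCudaHostAlloc provides above);
  // pinned memory is what makes cudaMemcpyAsync actually asynchronous.
  InputMeta* h_meta;
  cudaMallocHost((void**)&h_meta, kBatch * sizeof(InputMeta));
  h_meta[0].input = d_in0; h_meta[0].offset = 0;      h_meta[0].nElements = kElems;
  h_meta[1].input = d_in1; h_meta[1].offset = kElems; h_meta[1].nElements = kElems;

  cudaMemcpyAsync(d_meta, h_meta, kBatch * sizeof(InputMeta),
                  cudaMemcpyHostToDevice, stream);
  copyFromMeta<<<kBatch, 128, 0, stream>>>(d_out, d_meta, kBatch);

  // THC's caching host allocator records the copy and defers reuse of
  // h_meta until it finishes; with the plain runtime we just synchronize.
  cudaStreamSynchronize(stream);
  printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaFreeHost(h_meta);
  cudaFree(d_meta); cudaFree(d_out); cudaFree(d_in1); cudaFree(d_in0);
  cudaStreamDestroy(stream);
  return 0;
}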
diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h
index 2b8f563..aae6775 100644
--- a/lib/THC/generic/THCTensorMath.h
+++ b/lib/THC/generic/THCTensorMath.h
@@ -18,5 +18,13 @@ THC_API void THCTensor_(triu)(THCState *state, THCTensor *self, THCTensor *src,
THC_API void THCTensor_(diag)(THCState *state, THCTensor *self, THCTensor *src, long k);
THC_API accreal THCTensor_(trace)(THCState *state, THCTensor *self);
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, long n);
+THC_API void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, long n);
+
+#endif
+
+THC_API void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step);
#endif
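
The linspace/logspace/range declarations added here are backed in THCTensorMath.cu by one pattern: compute a scalar step, then fill a contiguous buffer with an index-to-value functor via thrust::tabulate. A minimal Thrust sketch of that pattern for float, using the same step formula as linspace; the functor below is a stand-in, not the LinspaceOp defined by the library.

#include <thrust/device_vector.h>
#include <thrust/tabulate.h>
#include <cstdio>

// Maps the flat index i to a + i*step, evaluated on the device.
struct LinspaceLike {
  float start, step;
  LinspaceLike(float a, float s) : start(a), step(s) {}
  __host__ __device__ float operator()(long i) const {
    return start + step * static_cast<float>(i);
  }
};

int main() {
  const long n = 5;
  const float a = 0.f, b = 1.f;
  // Same step formula as THCTensor_(linspace): (b - a) / (n - 1).
  const float step = (b - a) / static_cast<float>(n - 1);

  thrust::device_vector<float> r(n);
  thrust::tabulate(r.begin(), r.end(), LinspaceLike(a, step));

  for (long i = 0; i < n; ++i)
    printf("%g ", static_cast<float>(r[i]));   // 0 0.25 0.5 0.75 1
  printf("\n");
  return 0;
}

range reuses the same scheme with the user-supplied step, and logspace swaps LinspaceOp for LogspaceOp.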
diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu
index f8d85cf..0d47750 100644
--- a/lib/THC/generic/THCTensorMathBlas.cu
+++ b/lib/THC/generic/THCTensorMathBlas.cu
@@ -6,7 +6,7 @@ THC_API accreal
THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src)
{
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
THArgCheck(THCTensor_(nElement)(state, self) ==
THCTensor_(nElement)(state, src), 2, "sizes do not match");
@@ -44,7 +44,7 @@ THC_API void
THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec)
{
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
- THAssert(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec));
if( (mat->nDimension != 2) || (vec->nDimension != 1) )
THError("matrix and vector expected");
@@ -135,7 +135,7 @@ THC_API void
THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2)
{
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
- THAssert(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2));
if ( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) {
THError("vector and vector expected");
}
@@ -154,7 +154,9 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a
THCTensor_(copy)(state, r_, t);
}
- if(THCNumerics<real>::ne(beta, ScalarConvert<int, real>::to(1))) {
+ if(THCNumerics<real>::eq(beta, ScalarConvert<int, real>::to(0))) {
+ THCTensor_(zero)(state, r_);
+ } else if(THCNumerics<real>::ne(beta, ScalarConvert<int, real>::to(1))) {
THCTensor_(mul)(state, r_, r_, beta);
}
@@ -227,7 +229,7 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real
{
#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
- THAssert(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2));
char transpose_r, transpose_m1, transpose_m2;
THCTensor *r__, *m1_, *m2_;
@@ -378,7 +380,7 @@ THC_API void
THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
real alpha, THCTensor *batch1, THCTensor *batch2) {
#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
- THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
THArgCheck(THCTensor_(nDimension)(state, t) == 2, 4, "expected 2D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
@@ -430,8 +432,8 @@ __global__ void createBatchGemmBuffer(const real** buffer, real* data,
THC_API void
THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
real alpha, THCTensor *batch1, THCTensor *batch2) {
-#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
- THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
@@ -522,8 +524,10 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
ldb = batch2_->stride[1];
}
- // Compute pointers to matrices in each batch.
long num_batches = result_->size[0];
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ // Compute pointers to matrices in each batch.
size_t matrices_size = num_batches * sizeof(real*);
// Copy pointers to device.
@@ -580,6 +584,24 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
THCudaFree(state, d_matrices2);
THCudaFree(state, d_result_matrices);
+#elif defined(THC_REAL_IS_HALF)
+ // Currently no HgemmBatched in Cublas
+ for (long i = 0; i < num_batches; ++i) {
+ THCudaBlas_Hgemm(
+ state,
+ transpose_batch1,
+ transpose_batch2,
+ result_->size[transpose_result ? 2 : 1],
+ result_->size[transpose_result ? 1 : 2],
+ batch1_->size[transpose_result ? 1 : 2],
+ alpha,
+ THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda,
+ THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb,
+ beta,
+ THCTensor_(data)(state, result_) + i * result_->stride[0], ldc);
+ }
+#endif
+
if (batch1_ != batch1) {
THCTensor_(free)(state, batch1_);
}
@@ -597,4 +619,208 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
#endif
}
+THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, THCTensor *a)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 2, ra_, a));
+ THArgCheck(THCTensor_(nDimension)(state, a) == 3, 3, "expected 3D tensor");
+ THArgCheck(THCTensor_(size)(state, a, 1) ==
+ THCTensor_(size)(state, a, 2), 3, "matrices must be square");
+
+ if (ra_ != a) {
+ THCTensor_(resizeAs)(state, ra_, a);
+ // not sure if this is kosher, but things are nicer if we return in column major
+ if (ra_->stride[0] == 1) {
+ THCTensor_(transpose)(state, ra_, NULL, 1, 0);
+ } else if (ra_->stride[2] == 1) {
+ THCTensor_(transpose)(state, ra_, NULL, 1, 2);
+ }
+ THCTensor_(copy)(state, ra_, a);
+ }
+
+
+ int n = a->size[1];
+ int lda;
+ THCTensor *ra__;
+
+ if (ra_->stride[1] == 1) {
+ // column ordered, what BLAS wants
+ lda = ra_->stride[2];
+ ra__ = ra_;
+ } else {
+ // not column ordered, need to make it such (requires copy)
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, ra_, 1, 2);
+ ra__ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, ra__, NULL, 1, 2);
+ lda = ra__->stride[2];
+ }
+
+ long num_batches = ra__->size[0];
+
+ THCudaIntTensor_resize2d(state, rpivots_, num_batches, n);
+ int *pivots_gpu = THCudaIntTensor_data(state, rpivots_);
+
+ bool free_rinfo_ = !rinfo_;
+ if (rinfo_ == NULL) rinfo_ = THCudaIntTensor_new(state);
+ THCudaIntTensor_resize1d(state, rinfo_, num_batches);
+ int *info_gpu = THCudaIntTensor_data(state, rinfo_);
+
+ // Copy pointers to device.
+ real **d_result;
+ size_t matrices_size = num_batches * sizeof(real*);
+ THCudaCheck(THCudaMalloc(state, (void**)&d_result, matrices_size));
+
+ const long block = 512;
+ const long grid = (num_batches + block - 1) / block;
+ createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ (const real**)d_result, THCTensor_(data)(state, ra__),
+ ra__->stride[0], num_batches);
+
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgetrf(state, n, d_result, lda, pivots_gpu, info_gpu, num_batches);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgetrf(state, n, d_result, lda, pivots_gpu, info_gpu, num_batches);
+#endif
+
+ THCudaFree(state, d_result);
+
+ if (ra__ != ra_) {
+ THCTensor_(freeCopyTo)(state, ra__, ra_);
+ }
+
+ if (free_rinfo_) {
+ int min = THCudaIntTensor_minall(state, rinfo_);
+ int max = THCudaIntTensor_maxall(state, rinfo_);
+ THCudaIntTensor_free(state, rinfo_);
+ if (min != 0 || max != 0) {
+ THError("failed to factorize some batch elements (min info == %d, max info == %d)",
+ min, max);
+ }
+ }
+
+#else
+ THError("unimplemented data type");
+#endif
+}
+
+
+THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b,
+ THCTensor *atf, THCudaIntTensor *pivots)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b));
+ THArgCheck(THCTensor_(nDimension)(state, atf) == 3, 3, "expected 3D tensor");
+ THArgCheck(THCTensor_(nDimension)(state, b) == 3 ||
+ THCTensor_(nDimension)(state, b) == 2, 4, "expected 2D or 3D tensor");
+ THArgCheck(THCTensor_(size)(state, atf, 0) ==
+ THCTensor_(size)(state, b, 0), 3, "number of batches must be equal");
+ THArgCheck(THCTensor_(size)(state, atf, 1) ==
+ THCTensor_(size)(state, atf, 2), 3, "A matrices must be square");
+ THArgCheck(THCTensor_(size)(state, atf, 1) ==
+ THCTensor_(size)(state, b, 1), 3, "dimensions of A and b must be equal");
+
+ if (rb_ != b) {
+ THCTensor_(resizeAs)(state, rb_, b);
+ THCTensor_(copy)(state, rb_, b);
+ }
+
+
+ int n = atf->size[1];
+ int nrhs = rb_->nDimension > 2 ? rb_->size[2] : 1;
+ THCTensor *atf_;
+ THCTensor *rb__;
+ int lda, ldb;
+
+ // correct ordering of A_tf
+ if (atf->stride[1] == 1) {
+ // column ordered, what BLAS wants
+ lda = atf->stride[2];
+ atf_ = atf;
+ } else {
+ // not column ordered, need to make it such (requires copy)
+ // it would be nice if we could use the op(A) flags to automatically
+ // transpose A if needed, but this leads to unpredictable behavior if the
+ // user clones A_tf later with a different ordering
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, atf, 1, 2);
+ atf_ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, atf_, NULL, 1, 2);
+ lda = atf_->stride[2];
+ }
+
+ // correct ordering of B
+ if (rb_->stride[1] == 1) {
+ // column ordered
+ if (rb_->nDimension == 2 || rb_->size[2] == 1) {
+ ldb = n;
+ } else {
+ ldb = rb_->stride[2];
+ }
+ rb__ = rb_;
+ } else {
+ // make column ordered
+ if (rb_->nDimension > 2) {
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, rb_, 1, 2);
+ rb__ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, rb__, NULL, 1, 2);
+ ldb = rb__->stride[2];
+ } else {
+ rb__ = THCTensor_(newClone)(state, rb_);
+ ldb = n;
+ }
+ }
+
+ long num_batches = rb_->size[0];
+ size_t matrices_size = num_batches * sizeof(real*);
+
+ // Copy pointers to device.
+ real **d_result;
+ const real **d_atf;
+ THCudaCheck(THCudaMalloc(state, (void**)&d_result, matrices_size));
+ THCudaCheck(THCudaMalloc(state, (void**)&d_atf, matrices_size));
+
+ const long block = 512;
+ const long grid = (num_batches + block - 1) / block;
+ createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ (const real**)d_result, THCTensor_(data)(state, rb__),
+ rb__->stride[0], num_batches);
+ createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ d_atf, THCTensor_(data)(state, atf_),
+ atf_->stride[0], num_batches);
+
+ if (!THCudaIntTensor_isContiguous(state, pivots)) {
+ THError("Error: pivots is not contiguous.");
+ }
+
+ int *pivots_data = THCudaIntTensor_data(state, pivots);
+ int info;
+
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches);
+#endif
+
+ if (info < 0) {
+ THError("Illegal arg %d", -info);
+ }
+
+ THCudaFree(state, d_result);
+ THCudaFree(state, d_atf);
+
+ if (atf_ != atf) {
+ THCTensor_(free)(state, atf_);
+ }
+
+ if (rb__ != rb_) {
+ THCTensor_(freeCopyTo)(state, rb__, rb_);
+ }
+
+#else
+ THError("unimplemented data type");
+#endif
+}
+
#endif
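
Both the float/double baddbmm path and the new btrifact feed batched cuBLAS-style routines, which take an array of per-matrix device pointers rather than a single base pointer. The createBatchGemmBuffer launches above build that pointer array on the device; here is a standalone sketch of the idea with illustrative names, leaving out the actual batched BLAS call.

#include <cuda_runtime.h>
#include <cstdio>

// Writes base + i*stride into a device array of matrix pointers, one entry
// per batch element (the same job createBatchGemmBuffer does above).
__global__ void createBatchBuffer(const float** buffer, float* data,
                                  long stride, long num_batches) {
  long i = blockIdx.x * (long)blockDim.x + threadIdx.x;
  if (i < num_batches)
    buffer[i] = data + i * stride;
}

int main() {
  const long num_batches = 8, rows = 4, cols = 4;
  const long stride = rows * cols;          // elements between consecutive matrices

  float* d_data;
  cudaMalloc((void**)&d_data, num_batches * stride * sizeof(float));
  cudaMemset(d_data, 0, num_batches * stride * sizeof(float));

  const float** d_ptrs;
  cudaMalloc((void**)&d_ptrs, num_batches * sizeof(float*));

  const long block = 512;
  const long grid = (num_batches + block - 1) / block;   // same launch math as the patch
  createBatchBuffer<<<grid, block>>>(d_ptrs, d_data, stride, num_batches);
  cudaDeviceSynchronize();

  // d_ptrs is now suitable for a batched routine such as cublasSgemmBatched
  // or cublasSgetrfBatched, which is what the THCudaBlas_* wrappers used
  // above appear to target.
  printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaFree((void*)d_ptrs);
  cudaFree(d_data);
  return 0;
}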
diff --git a/lib/THC/generic/THCTensorMathBlas.h b/lib/THC/generic/THCTensorMathBlas.h
index f37910c..1d9ddfa 100644
--- a/lib/THC/generic/THCTensorMathBlas.h
+++ b/lib/THC/generic/THCTensorMathBlas.h
@@ -9,5 +9,8 @@ THC_API void THCTensor_(addr)(THCState *state, THCTensor *self, real beta, THCTe
THC_API void THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
THC_API void THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
+THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, THCTensor *a);
+THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *atf, THCudaIntTensor *pivots);
+
#endif
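
A note on the addr change in THCTensorMathBlas.cu above: scaling the output by beta cannot replace an explicit zero fill when beta == 0, because in IEEE arithmetic 0 * NaN (and 0 * Inf) stays NaN, so garbage left in a reused output buffer would leak through. A tiny host-side illustration with made-up variable names:

#include <cmath>
#include <cstdio>

int main() {
  float stale = std::nanf("");   // stands in for whatever was left in r_
  float beta  = 0.f;
  printf("beta * stale       = %f\n", beta * stale);  // nan, not 0
  printf("explicit zero fill = %f\n", 0.0f);          // what THCTensor_(zero) produces
  return 0;
}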
diff --git a/lib/THC/generic/THCTensorMathCompare.cu b/lib/THC/generic/THCTensorMathCompare.cu
index 77f1ab5..079583c 100644
--- a/lib/THC/generic/THCTensorMathCompare.cu
+++ b/lib/THC/generic/THCTensorMathCompare.cu
@@ -4,7 +4,7 @@
THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -12,7 +12,7 @@ THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -20,7 +20,7 @@ THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -28,7 +28,7 @@ THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -36,7 +36,7 @@ THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -44,7 +44,7 @@ THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -52,7 +52,7 @@ THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -60,7 +60,7 @@ THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -68,7 +68,7 @@ THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -76,7 +76,7 @@ THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -84,7 +84,7 @@ THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -92,7 +92,7 @@ THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
diff --git a/lib/THC/generic/THCTensorMathCompareT.cu b/lib/THC/generic/THCTensorMathCompareT.cu
index 4b59abf..e541641 100644
--- a/lib/THC/generic/THCTensorMathCompareT.cu
+++ b/lib/THC/generic/THCTensorMathCompareT.cu
@@ -5,7 +5,7 @@
THC_API void
THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLTOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -14,7 +14,7 @@ THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGTOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -23,7 +23,7 @@ THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLEOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -32,7 +32,7 @@ THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGEOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -41,7 +41,7 @@ THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorEQOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -50,7 +50,7 @@ THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorNEOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -59,7 +59,7 @@ THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLTOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -68,7 +68,7 @@ THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGTOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -77,7 +77,7 @@ THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLEOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -86,7 +86,7 @@ THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGEOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -95,7 +95,7 @@ THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorEQOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -104,7 +104,7 @@ THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorNEOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
diff --git a/lib/THC/generic/THCTensorMathMagma.cu b/lib/THC/generic/THCTensorMathMagma.cu
index 635834d..c35a83e 100644
--- a/lib/THC/generic/THCTensorMathMagma.cu
+++ b/lib/THC/generic/THCTensorMathMagma.cu
@@ -10,7 +10,7 @@ static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src,
{
long size[1] = { k };
long stride[1] = { 1 };
- THCTensor_(rawResize)(state, self, 1, size, stride);
+ THCTensor_(resizeNd)(state, self, 1, size, stride);
size_t len = k * sizeof(real);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
@@ -19,7 +19,7 @@ static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src,
{
long size[2] = { m, n };
long stride[2] = { 1, m };
- THCTensor_(rawResize)(state, self, 2, size, stride);
+ THCTensor_(resizeNd)(state, self, 2, size, stride);
size_t len = m * n * sizeof(real);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
@@ -54,7 +54,7 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
long size[2] = { src->size[0], src->size[1] };
long stride[2] = { 1, src->size[0] };
- THCTensor_(rawResize)(state, self, 2, size, stride);
+ THCTensor_(resizeNd)(state, self, 2, size, stride);
THCTensor_(copy)(state, self, src);
return self;
}
@@ -286,13 +286,14 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_,
#ifdef USE_MAGMA
THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
- magma_vec_t jobvt = jobu;
+ magma_vec_t jobz = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
+ int iunused[1];
int m = a->size[0];
int n = a->size[1];
int k = m < n ? m : n;
- int j = (jobu == MagmaAllVec) ? m : k;
+ int j = (jobz == MagmaAllVec) ? m : k;
+ int jv = (jobz == MagmaAllVec) ? n : k;
real *a_data = th_magma_malloc_pinned<real>(m * n);
THCTensor_(copyTensor2d)(state, a_data, a);
@@ -305,32 +306,36 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_,
int info;
#if defined(THC_REAL_IS_FLOAT)
- magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+ magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info);
#else
- magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+ magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info);
#endif
int lwork = (int) wkopt;
real *work_data = th_magma_malloc_pinned<real>(lwork);
+ int *iwork = th_magma_malloc_pinned<int>(8 * k);
#if defined(THC_REAL_IS_FLOAT)
- magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+ magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info);
#else
- magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+ magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info);
#endif
if (info > 0)
- THError("MAGMA gesvd : %d superdiagonals failed to converge", info);
+ THError("MAGMA gesdd : the updating process of SBDSDC did not converge (error: %d)", info);
else if (info < 0)
- THError("MAGMA gesvd : Argument %d : illegal value", -info);
+ THError("MAGMA gesdd : Argument %d : illegal value", -info);
THCTensor_(copyArray2d)(state, rv_, rv_data, n, n);
THCTensor_(transpose)(state, rv_, NULL, 0, 1);
+ if (jobz != MagmaAllVec)
+ THCTensor_(narrow)(state, rv_, rv_, 1, 0, jv);
THCTensor_(copyArray2d)(state, ru_, ru_data, m, j);
THCTensor_(copyArray1d)(state, rs_, rs_data, k);
THCTensor_(copyArray2d)(state, ra_, a_data, m, n);
magma_free_pinned(work_data);
+ magma_free_pinned(iwork);
magma_free_pinned(rv_data);
magma_free_pinned(ru_data);
magma_free_pinned(rs_data);
@@ -453,6 +458,11 @@ THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a)
THCudaCheck(THCudaFree(state, ipiv_gpu));
THCudaCheck(THCudaFree(state, info_gpu));
+
+ THCudaCheck(THCudaFree(state, d_matrices1));
+ THCudaCheck(THCudaFree(state, d_matrices1_const));
+ THCudaCheck(THCudaFree(state, d_matrices2));
+
THCTensor_(freeCopyTo)(state, output, input);
#endif
}
@@ -598,19 +608,42 @@ THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THC
int k = (m < n ? m : n);
#ifdef MAGMA_V2
+#if defined(THC_REAL_IS_FLOAT)
int nb = magma_get_sgeqrf_nb(m, n);
#else
+ int nb = magma_get_dgeqrf_nb(m, n);
+#endif
+#else
+#if defined(THC_REAL_IS_FLOAT)
int nb = magma_get_sgeqrf_nb(m);
+#else
+ int nb = magma_get_dgeqrf_nb(m);
+#endif
#endif
real *a_data = THCTensor_(data)(state, a);
- real *tau_data = th_magma_malloc_pinned<real>(n*n);
-
- THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + ((n+31)/32)*32)*nb);
+ real *tau_data = th_magma_malloc_pinned<real>(k);
+ THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + magma_roundup(n, 32))*nb);
real *work_data = THCTensor_(data)(state, work);
int info;
#if defined(THC_REAL_IS_FLOAT)
+ magma_sgeqrf2_gpu(m, n, a_data, m, tau_data, &info);
+#else
+ magma_dgeqrf2_gpu(m, n, a_data, m, tau_data, &info);
+#endif
+
+ if (info != 0)
+ THError("MAGMA geqrf2 : Argument %d : illegal value.", -info);
+
+ THCTensor_(narrow)(state, a, a, 0, 0, k);
+ THCTensor_(triu)(state, rr_, a, 0);
+ THCTensor_(free)(state, a);
+
+ a = THCTensor_(newColumnMajor)(state, rq_, a_);
+ a_data = THCTensor_(data)(state, a);
+
+#if defined(THC_REAL_IS_FLOAT)
magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
#else
magma_dgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
@@ -622,10 +655,6 @@ THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THC
THCTensor *q = THCTensor_(newColumnMajor)(state, rq_, a);
real *q_data = THCTensor_(data)(state, q);
- THCTensor_(narrow)(state, a, a, 0, 0, k);
- THCTensor_(triu)(state, rr_, a, 0);
- THCTensor_(free)(state, a);
-
#if defined(THC_REAL_IS_FLOAT)
magma_sorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info);
#else
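
The gesvd2 rework above switches from the gesvd driver to the divide-and-conquer gesdd driver, which needs an extra integer workspace of size 8*min(m,n) and is sized with the usual two-phase query (lwork = -1, then the real call). A minimal host-side sketch of that calling sequence, assuming a MAGMA 2.x build; the sizes and the zero input matrix are illustrative only.

#include <magma_v2.h>
#include <cstdlib>
#include <cstdio>

int main() {
  magma_init();

  const magma_int_t m = 4, n = 3, k = (m < n ? m : n);
  float *a  = (float*) calloc((size_t)(m * n), sizeof(float));  // column-major, lda = m
  float *s  = (float*) calloc((size_t)k,       sizeof(float));
  float *u  = (float*) calloc((size_t)(m * m), sizeof(float));
  float *vt = (float*) calloc((size_t)(n * n), sizeof(float));
  magma_int_t iunused[1], info = 0;

  // Phase 1: workspace query; gesdd reports the optimal lwork in wkopt.
  float wkopt = 0.f;
  magma_sgesdd(MagmaAllVec, m, n, a, m, s, u, m, vt, n,
               &wkopt, -1, iunused, &info);

  // Phase 2: allocate work plus the 8*min(m,n) integer workspace, then run.
  magma_int_t lwork = (magma_int_t) wkopt;
  float *work = (float*) malloc((size_t)lwork * sizeof(float));
  magma_int_t *iwork = (magma_int_t*) malloc((size_t)(8 * k) * sizeof(magma_int_t));
  magma_sgesdd(MagmaAllVec, m, n, a, m, s, u, m, vt, n,
               work, lwork, iwork, &info);
  printf("gesdd info = %lld\n", (long long) info);

  free(iwork); free(work); free(vt); free(u); free(s); free(a);
  magma_finalize();
  return 0;
}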
diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu
index 0b4094b..def5970 100644
--- a/lib/THC/generic/THCTensorMathPairwise.cu
+++ b/lib/THC/generic/THCTensorMathPairwise.cu
@@ -5,7 +5,7 @@
THC_API void
THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -24,7 +24,7 @@ THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorSubConstantOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -43,7 +43,7 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -62,7 +62,7 @@ THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero");
if (self_ == src_) {
@@ -81,9 +81,57 @@ THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
}
THC_API void
+THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THCTensor_(mul)(state, self_, src_, pow(2, value));
+#elif defined(THC_REAL_IS_HALF)
+ return THError("lshift not supported for torch.CudaHalfTensor");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorLShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorLShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THCTensor_(mul)(state, self_, src_, pow(2, value));
+#elif defined(THC_REAL_IS_HALF)
+ return THError("rshift not supported for torch.CudaHalfTensor");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorRShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorRShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorFmodOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -102,7 +150,7 @@ THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorRemainderOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -120,7 +168,7 @@ THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, real v
void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, long k)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
THArgCheck(src_->nDimension == 2, 1, "expected a matrix");
THCTensor *src = src_;
@@ -153,7 +201,7 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, long k
void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, long k)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
THArgCheck(src_->nDimension == 2, 1, "expected a matrix");
THCTensor *src = src_;
@@ -186,7 +234,7 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, long k
THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (!THCTensor_(isSameSizeAs(state, self_, src_))) {
return 0;
}
@@ -210,4 +258,70 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_
return min != 0;
}
+THC_API void
+THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+ return THError("bitand only supported for integer type tensors");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+ return THError("bitor only supported for integer type tensors");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+ return THError("bitxor only supported for integer type tensors");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
#endif
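
The scalar lshift/rshift added above fall back to scaling for floating-point tensors: shifting by value is treated as multiplying (or dividing) by 2^value, integer tensors get a genuine bitwise shift, and half raises an error. A small host-side sketch of that convention; the helper name is illustrative.

#include <cmath>
#include <cstdio>

// Mirrors the dispatch above: floating types scale by 2^value,
// integer types shift bits.
template <typename T>
T lshiftLike(T x, T value, bool is_floating) {
  if (is_floating)
    return (T)(x * std::pow(2.0, (double) value));   // float/double path
  return (T)((long) x << (long) value);              // integer path
}

int main() {
  printf("%g\n", lshiftLike(1.5, 3.0, true));          // 12  (1.5 * 2^3)
  printf("%ld\n", (long) lshiftLike(3L, 2L, false));   // 12  (3 << 2)
  return 0;
}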
diff --git a/lib/THC/generic/THCTensorMathPairwise.h b/lib/THC/generic/THCTensorMathPairwise.h
index 261c203..8b6bcd6 100644
--- a/lib/THC/generic/THCTensorMathPairwise.h
+++ b/lib/THC/generic/THCTensorMathPairwise.h
@@ -6,8 +6,13 @@ THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, r
THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(lshift)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(rshift)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(remainder)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src);
diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu
index b97908a..cdf4b82 100644
--- a/lib/THC/generic/THCTensorMathPointwise.cu
+++ b/lib/THC/generic/THCTensorMathPointwise.cu
@@ -14,7 +14,7 @@
}; \
\
void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); \
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); \
if (self_ == src) { \
if (!THC_pointwiseApply1(state, self_, Tensor_##NAME##_##REAL##_Op())) { \
THArgCheck(false, 2, CUTORCH_DIM_WARNING); \
@@ -36,6 +36,7 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( log, THCNumerics<real>::log, Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(lgamma, THCNumerics<real>::lgamma, Real)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log1p, THCNumerics<real>::log1p, Real)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( exp, THCNumerics<real>::exp, Real)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cos, THCNumerics<real>::cos, Real)
@@ -66,7 +67,7 @@ IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( abs, THCNumerics<real>::abs, Real)
#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC
void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) {
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorSignOp<real>())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -85,7 +86,7 @@ void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) {
void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, real min_value,
real max_value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorClampOp<real>(min_value, max_value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -104,7 +105,7 @@ void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, real m
THC_API void
THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, x, y));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y));
int i;
long nd = THCTensor_(nDimension)(state, x);
@@ -140,7 +141,7 @@ THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y,
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
void THCTensor_(sigmoid)(THCState* state, THCTensor* self_, THCTensor* src) {
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp<real>())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -157,7 +158,7 @@ void THCTensor_(sigmoid)(THCState* state, THCTensor* self_, THCTensor* src) {
}
void THCTensor_(pow)(THCState *state, THCTensor *self_, THCTensor *src, real value) {
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorPowOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -175,7 +176,7 @@ void THCTensor_(pow)(THCState *state, THCTensor *self_, THCTensor *src, real val
void THCTensor_(tpow)(THCState *state, THCTensor *self_, real value, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorTPowOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -194,7 +195,7 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, real value, THCTensor *
THC_API void
THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w)
{
- THAssert(THCTensor_(checkGPU)(state, 3, result, a, b));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b));
THArgCheck(THCTensor_(nElement)(state, a) ==
THCTensor_(nElement)(state, b), 3, "sizes do not match");
THCTensor_(resizeAs)(state, result, a);
@@ -211,7 +212,7 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b,
THC_API void
THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -249,7 +250,7 @@ THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
THC_API void
THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -291,7 +292,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
THC_API void
THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -315,7 +316,7 @@ THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *
THC_API void
THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -339,7 +340,7 @@ THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *
THC_API void
THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -361,9 +362,65 @@ THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *
}
THC_API void
+THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF)
+ return THError("clshift not supported for torch.CudaHalfTensor");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self <<= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorLShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 << src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorLShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF)
+ return THError("crshift not supported for torch.CudaHalfTensor");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self >>= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorRShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 >> src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorRShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -382,7 +439,7 @@ THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s
THC_API void
THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -401,7 +458,7 @@ THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s
THC_API void
THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -420,7 +477,7 @@ THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTen
THC_API void
THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -439,7 +496,7 @@ THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *
THC_API void
THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (self == src) {
if (!THC_pointwiseApply1(state, self, TensorMaxValueOp<real>(value))) {
@@ -456,7 +513,7 @@ THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real val
THC_API void
THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (self == src) {
if (!THC_pointwiseApply1(state, self, TensorMinValueOp<real>(value))) {
@@ -473,7 +530,7 @@ THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real val
THC_API void
THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
if(self_ != t)
{
THCTensor_(resizeAs)(state, self_, t);
@@ -498,7 +555,7 @@ THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, real value,
THC_API void
THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
if(self_ != t)
{
THCTensor_(resizeAs)(state, self_, t);
@@ -519,4 +576,87 @@ THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, real value,
THCudaCheck(cudaGetLastError());
}
+THC_API void
+THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ return THError("cbitand is only supported for integer type tensors");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self &= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 & src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ return THError("cbitor is only supported for integer type tensors");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self |= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 | src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ return THError("cbitor is only supported for integer type tensors");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self ^= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 ^ src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
#endif
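
The hunks above add element-wise shift (clshift, crshift) and bitwise (cbitand, cbitor, cbitxor) kernels; the shift variants are rejected for half tensors and the bitwise variants for half/float/double. A minimal Lua sketch of the intended use, assuming the TensorMath.lua changes in this commit expose them under the method names lshift/rshift/bitand/bitor/bitxor (names assumed, not verified here):

  require 'cutorch'
  local a = torch.CudaIntTensor(4):fill(12)   -- binary 1100
  local b = torch.CudaIntTensor(4):fill(10)   -- binary 1010
  print(a:clone():bitand(b))  -- element-wise AND: 8  (1000)
  print(a:clone():bitor(b))   -- element-wise OR:  14 (1110)
  print(a:clone():bitxor(b))  -- element-wise XOR: 6  (0110)
  print(a:clone():lshift(2))  -- shift by a scalar: 48; a tensor argument would map to clshift
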
diff --git a/lib/THC/generic/THCTensorMathPointwise.h b/lib/THC/generic/THCTensorMathPointwise.h
index 34e594a..17171c0 100644
--- a/lib/THC/generic/THCTensorMathPointwise.h
+++ b/lib/THC/generic/THCTensorMathPointwise.h
@@ -6,6 +6,7 @@
THC_API void THCTensor_(sigmoid)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(log)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(lgamma)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(log1p)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(exp)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(cos)(THCState *state, THCTensor *self, THCTensor *src);
@@ -44,12 +45,17 @@ THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1,
THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(clshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(crshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(addcmul)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(addcdiv)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2);
diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu
index ed0e204..bbc950e 100644
--- a/lib/THC/generic/THCTensorMathReduce.cu
+++ b/lib/THC/generic/THCTensorMathReduce.cu
@@ -3,13 +3,14 @@
#else
THC_API void
-THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension, int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (!THC_reduceDim(state, self, src,
thrust::identity<real>(),
ReduceAdd<real, real>(),
ScalarConvert<int, real>::to(0),
- dimension)) {
+ dimension,
+ keepdim)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -17,13 +18,14 @@ THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension
}
THC_API void
-THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimension, int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (!THC_reduceDim(state, self, src,
thrust::identity<real>(),
ReduceMultiply<real, real>(),
ScalarConvert<int, real>::to(1),
- dimension)) {
+ dimension,
+ keepdim)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -31,10 +33,10 @@ THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimensio
}
THC_API void
-THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim)
+THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
- THCTensor_(sum)(state, self, src, dim);
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
+ THCTensor_(sum)(state, self, src, dim, keepdim);
THCTensor_(div)(state, self, self, ScalarConvert<long, real>::to(THCTensor_(size)(state, src, dim)));
}
@@ -43,7 +45,7 @@ THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim)
THC_API void
THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, real maxnorm)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
THCTensor *self_;
THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0);
THCTensor *data = THCTensor_(newClone)(state, src_);
@@ -70,9 +72,9 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value,
}
THC_API void
-THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag)
+THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THLongStorage *dim = THCTensor_(newSizeOf)(state, src);
THLongStorage_set(dim, dimension, 1);
THCTensor_(resize)(state, self_, dim, NULL);
@@ -89,12 +91,16 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, long dimensio
THCTensor_(free)(state, src);
THCTensor_(freeCopyTo)(state, self, self_);
+
+ if (!keepdim) {
+ THCTensor_(squeeze1d)(state, self_, self_, dimension);
+ }
}
THC_API void
-THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag)
+THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THLongStorage *dim = THCTensor_(newSizeOf)(state, src);
THLongStorage_set(dim, dimension, 1);
THCTensor_(resize)(state, self_, dim, NULL);
@@ -111,19 +117,23 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, long dimensio
THCTensor_(free)(state, src);
THCTensor_(freeCopyTo)(state, self, self_);
+
+ if (!keepdim) {
+ THCTensor_(squeeze1d)(state, self_, self_, dimension);
+ }
}
THC_API accreal
THCTensor_(stdall)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
return THCNumerics<accreal>::sqrt((THCTensor_(varall)(state, self)));
}
THC_API accreal
THCTensor_(varall)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal mean = THCTensor_(meanall)(state, self);
accreal val;
@@ -146,28 +156,28 @@ THCTensor_(varall)(THCState *state, THCTensor *self)
}
THC_API void
-THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension)
+THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(0.0))) {
THC_reduceDim(state, self, src,
TensorNonZeroOp<real>(), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
} else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(1.0))) {
THC_reduceDim(state, self, src,
TensorNormOp<real, 1>(value), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
} else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(2.0))) {
THC_reduceDim(state, self, src,
TensorNormOp<real, 2>(value), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
THCTensor_(pow)(state, self, self, ScalarConvert<float, real>::to(0.5));
} else {
THC_reduceDim(state, self, src,
TensorNormOp<real, -1>(value), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
THCTensor_(pow)(state, self, self, THCNumerics<real>::cinv(value));
}
@@ -177,7 +187,7 @@ THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, l
THC_API accreal
THCTensor_(normall)(THCState *state, THCTensor *self, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal result;
if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(0.0))) {
@@ -222,7 +232,7 @@ THCTensor_(normall)(THCState *state, THCTensor *self, real value)
accreal THCTensor_(dist)(THCState *state, THCTensor *self,
THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
self = THCTensor_(newContiguous)(state, self);
ptrdiff_t size = THCTensor_(nElement)(state, self);
src = THCTensor_(newContiguous)(state, src);
@@ -248,7 +258,7 @@ accreal THCTensor_(dist)(THCState *state, THCTensor *self,
THC_API accreal
THCTensor_(sumall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -265,7 +275,7 @@ THCTensor_(sumall)(THCState *state, THCTensor *self) {
THC_API accreal
THCTensor_(prodall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -276,11 +286,6 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) {
THArgCheck(false, 1, CUTORCH_DIM_WARNING);
}
- val = THCNumerics<accreal>::div(
- val,
- ScalarConvert<long, accreal>::to(THCTensor_(nElement)(state, self)) - 1
- );
-
THCudaCheck(cudaGetLastError());
return val;
}
@@ -288,14 +293,14 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) {
THC_API accreal
THCTensor_(meanall)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
THArgCheck(self->nDimension > 0, 1, "empty Tensor");
return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self);
}
THC_API real
THCTensor_(minall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
real val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -311,7 +316,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) {
THC_API real
THCTensor_(maxall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
real val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -330,8 +335,9 @@ THCTensor_(max)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *src,
- long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+ long dimension,
+ int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src));
thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
init =
@@ -339,7 +345,7 @@ THCTensor_(max)(THCState *state,
THCNumerics<typename TensorUtils<THCTensor>::DataType>::min(), 1);
return THC_reduceDimIndex(
- state, values, indices, src, dimension, init,
+ state, values, indices, src, dimension, keepdim, init,
MaxValuePair<typename TensorUtils<THCTensor>::DataType, long>());
}
@@ -348,8 +354,9 @@ THCTensor_(min)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *src,
- long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+ long dimension,
+ int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src));
thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
init =
@@ -357,7 +364,7 @@ THCTensor_(min)(THCState *state,
THCNumerics<typename TensorUtils<THCTensor>::DataType>::max(), 1);
return THC_reduceDimIndex(
- state, values, indices, src, dimension, init,
+ state, values, indices, src, dimension, keepdim, init,
MinValuePair<typename TensorUtils<THCTensor>::DataType, long>());
}
diff --git a/lib/THC/generic/THCTensorMathReduce.h b/lib/THC/generic/THCTensorMathReduce.h
index dc38ed6..095be42 100644
--- a/lib/THC/generic/THCTensorMathReduce.h
+++ b/lib/THC/generic/THCTensorMathReduce.h
@@ -5,9 +5,9 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
THC_API void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, real max_norm);
-THC_API void THCTensor_(std)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag);
-THC_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension);
-THC_API void THCTensor_(var)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag);
+THC_API void THCTensor_(std)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag, int keepdim);
+THC_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, int keepdim);
+THC_API void THCTensor_(var)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag, int keepdim);
THC_API accreal THCTensor_(stdall)(THCState *state, THCTensor *self);
THC_API accreal THCTensor_(normall)(THCState *state, THCTensor *self, real value);
@@ -15,9 +15,9 @@ THC_API accreal THCTensor_(varall)(THCState *state, THCTensor *self);
#endif
-THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, long dim);
-THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, long dim);
-THC_API void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim);
+THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim);
+THC_API void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim);
THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self);
THC_API accreal THCTensor_(prodall)(THCState *state, THCTensor *self);
@@ -26,11 +26,11 @@ THC_API accreal THCTensor_(meanall)(THCState *state, THCTensor *self);
THC_API void THCTensor_(min)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
- THCTensor *src, long dim);
+ THCTensor *src, long dim, int keepdim);
THC_API void THCTensor_(max)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
- THCTensor *src, long dim);
+ THCTensor *src, long dim, int keepdim);
THC_API real THCTensor_(minall)(THCState *state, THCTensor *self);
THC_API real THCTensor_(maxall)(THCState *state, THCTensor *self);
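
All of the dimension reductions above (sum, prod, mean, norm, std, var, min, max) now take a keepdim flag: with keepdim == 1 the reduced dimension stays as size 1, with keepdim == 0 it is squeezed out via squeeze1d. A small Lua sketch of the two shapes, assuming the Lua wrapper keeps the pre-existing torch behaviour of retaining the reduced dimension by default:

  require 'cutorch'
  local x = torch.CudaTensor(4, 5):uniform()
  local s = x:sum(2)         -- keepdim == 1 at the C level: a 4x1 result
  print(s:size())
  print(s:squeeze():size())  -- dropping the singleton dim gives the keepdim == 0 shape: 4
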
diff --git a/lib/THC/generic/THCTensorMathScan.cu b/lib/THC/generic/THCTensorMathScan.cu
index 8a8e434..8242139 100644
--- a/lib/THC/generic/THCTensorMathScan.cu
+++ b/lib/THC/generic/THCTensorMathScan.cu
@@ -2,6 +2,27 @@
#define THC_GENERIC_FILE "generic/THCTensorMathScan.cu"
#else
+#ifndef THC_REAL_IS_HALF
+template<class BinaryFunction>
+__host__ void THCTensor_(scanThrust)(
+ THCState *state,
+ THCTensor *dst,
+ THCTensor *src,
+ BinaryFunction binary_op)
+{
+ THCThrustAllocator thrustAlloc(state);
+ thrust::device_ptr<real> src_data(THCTensor_(data)(state, src));
+ thrust::device_ptr<real> dst_data(THCTensor_(data)(state, dst));
+ ptrdiff_t size = THCTensor_(nElement)(state, src);
+ thrust::inclusive_scan(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+ src_data, src_data + size, dst_data,
+ binary_op);
+}
+#endif
+
template<class BinaryOp>
__host__ void THCTensor_(scanOuterDim)(THCState *state, THCTensor *tgt,
THCTensor *src, long dimension,
@@ -57,12 +78,22 @@ template<class BinaryFunction>
void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src,
long dimension, real init, BinaryFunction binary_op)
{
- THCTensor_(resizeAs)(state, self_, src);
+ // "init" must be the identity element for binary_op
+ int ndim = THCTensor_(nDimension)(state, src);
+ THArgCheck(dimension >= 0 && dimension < ndim, 3, "dimension %d out of range",
+ dimension + TH_INDEX_BASE);
+ THCTensor_(resizeAs)(state, self_, src);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
src = THCTensor_(newContiguous)(state, src);
- if (dimension == THCTensor_(nDimension)(state, src) - 1) {
+#ifndef THC_REAL_IS_HALF
+ if (ndim == 1) {
+ // thrust does not take an "init"
+ THCTensor_(scanThrust)(state, self, src, binary_op);
+ } else
+#endif
+ if (dimension == ndim - 1) {
THCTensor_(scanInnermostDim)(state, self, src, init, binary_op);
} else {
THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op);
@@ -74,14 +105,14 @@ void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src,
void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, long dimension)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
return THCTensor_(scanDim)(state, self, src, dimension,
ScalarConvert<float, real>::to(0.0), AddOp<real>());
}
void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, long dimension)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
return THCTensor_(scanDim)(state, self, src, dimension,
ScalarConvert<float, real>::to(1.0), MulOp<real>());
}
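
scanDim now validates the requested dimension and, for 1-D tensors of non-half types, routes through thrust::inclusive_scan on the current stream (which needs no explicit init, hence the identity-element note above); other shapes still use the inner/outer-dimension kernels. From the Lua side cumsum and cumprod behave as before; a quick sketch:

  require 'cutorch'
  local x = torch.Tensor({1, 2, 3, 4}):cuda()
  print(x:cumsum(1))   -- 1  3  6  10  (inclusive prefix sum; 1-D tensors hit the Thrust path)
  print(x:cumprod(1))  -- 1  2  6  24  (inclusive prefix product)
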
diff --git a/lib/THC/generic/THCTensorMode.cu b/lib/THC/generic/THCTensorMode.cu
new file mode 100644
index 0000000..e5a17f2
--- /dev/null
+++ b/lib/THC/generic/THCTensorMode.cu
@@ -0,0 +1,315 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMode.cu"
+#else
+
+THC_API void THCTensor_(calculateMode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ THCudaLongStorage *sortBuffer,
+ int dimension,
+ THLongStorage *position) {
+ THAssert(THCTensor_(isContiguous)(state, input));
+
+ // Because the input is contiguous, we want to get a reference to the
+ // location of the buffer at the innermost dimension that we are going
+ // to calculate the mode for --> we do this by manually doing the stride
+ // calculations to get an offset
+ real *data = THCTensor_(data)(state, input);
+ for (int i = 0; i < THLongStorage_size(position); ++i) {
+ data += THLongStorage_data(position)[i] * THCTensor_(stride)(state, input, i);
+ }
+
+ long nElement = THCTensor_(size)(state, input, THCTensor_(nDimension)(state, input) - 1);
+ THCThrustAllocator thrustAlloc(state);
+
+ // Wrap input data, sortBuffer, in Thrust device vectors
+ thrust::device_ptr<real> vecPtr = thrust::device_pointer_cast(data);
+ thrust::device_vector<real> iter(vecPtr, vecPtr + nElement);
+ thrust::device_ptr<long> sbPtr = thrust::device_pointer_cast(THCudaLongStorage_data(state, sortBuffer));
+ thrust::device_vector<long> seq(sbPtr, sbPtr + nElement);
+
+ // Fill sortBuffer with [0, 1, 2, ... nElement - 1]
+ thrust::sequence(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ seq.begin(), seq.end());
+
+ // Sort the input data. The original indices of the data are stored in seq
+ thrust::sort_by_key(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(), seq.begin()
+#if defined(THC_REAL_IS_HALF)
+ , ThrustHalfLess()
+#endif
+ );
+
+ // Count # of unique elements via an inner product between adjacent elements.
+ // Add 1 if two neighboring elements are not equal.
+ int unique = 1 + thrust::inner_product(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end() - 1, iter.begin() + 1, 0, thrust::plus<int>(),
+#if defined(THC_REAL_IS_HALF)
+ ThrustHalfNotEqualTo()
+#else
+ thrust::not_equal_to<real>()
+#endif
+ );
+
+ // Count frequency of each element
+ thrust::device_vector<real> keys(unique);
+ thrust::device_vector<int> counts(unique);
+ thrust::reduce_by_key(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(),
+ thrust::constant_iterator<int>(1), keys.begin(), counts.begin()
+#if defined(THC_REAL_IS_HALF)
+ , ThrustHalfEqualTo()
+#endif
+ );
+
+ // Find index of maximum count
+ thrust::device_vector<int>::iterator it = thrust::max_element(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ counts.begin(), counts.end());
+ real mode = keys[it - counts.begin()];
+
+ // Find first index within which it occurs
+#if defined(THC_REAL_IS_HALF)
+ thrust::device_vector<real>::iterator positionIter = thrust::find_if(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(), ThrustHalfEqualToPredicate(mode));
+#else
+ thrust::device_vector<real>::iterator positionIter = thrust::find(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(), mode);
+#endif
+
+ THAssert(positionIter != iter.end());
+ long index = TH_INDEX_BASE + seq[positionIter - iter.begin()];
+
+ // Place mode, index in output
+ ptrdiff_t valuesOffset = THCTensor_(storageOffset)(state, values);
+ long indicesOffset = THCudaLongTensor_storageOffset(state, indices);
+
+ for (int i = 0; i < THLongStorage_size(position); ++i) {
+ long pos = THLongStorage_data(position)[i];
+ valuesOffset += THCTensor_(stride)(state, values, i) * pos;
+ indicesOffset += THCudaLongTensor_stride(state, indices, i) * pos;
+ }
+ THCStorage_(set)(state, THCTensor_(storage)(state, values), valuesOffset, mode);
+ THCudaLongStorage_set(state, THCudaLongTensor_storage(state, indices), indicesOffset, index);
+}
+
+// this probably could be a loop, not a recursive algorithm
+THC_API void THCTensor_(dimApplyMode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ THCudaLongStorage *sortBuffer,
+ int dimension,
+ THLongStorage *position,
+ int curDim) {
+ long ndim = THCTensor_(nDimension)(state, input);
+
+ // Because we have transposed the Tensor, the data for the dimension we are mode'ing along
+ // is always in the innermost dimension
+ if (curDim == ndim - 1) {
+ THCTensor_(calculateMode)(state, values, indices, input, sortBuffer, dimension, position);
+ } else {
+ // Loop through the values and recurse
+ for (int i = 0; i < THCTensor_(size)(state, input, curDim); ++i) {
+ position->data[curDim] = i;
+ THCTensor_(dimApplyMode)(state, values, indices, input, sortBuffer, dimension, position, curDim + 1);
+ }
+ }
+}
+
+#define MAX_GRID_SIZE 65535
+#define MAX_BLOCK_SIZE 1024
+
+THC_API void THCTensor_(mode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ int dimension,
+ int keepdim) {
+ THLongStorage *dim;
+ THCTensor *transposed, *contiguous, *valuesTransposed;
+ THLongStorage *position;
+ THCudaLongStorage *sortBuffer;
+ THCudaLongTensor *indicesTransposed;
+ long ndim, sliceSize, slices;
+
+
+ THAssert(THCTensor_(checkGPU)(state, 1, values));
+
+ // Verify they are asking for a valid dimension
+ ndim = THCTensor_(nDimension)(state, input);
+ THArgCheck(dimension >= 0 && dimension < ndim, 4, "Dimension out of bounds");
+
+ sliceSize = THCTensor_(size)(state, input, dimension);
+ slices = THCTensor_(nElement)(state, input) / sliceSize;
+
+ // Resize output value, index Tensors to appropriate sizes (i.e. the same as
+ // the input Tensor, except at dim=dimension, the size is 1)
+ dim = THCTensor_(newSizeOf)(state, input);
+ THLongStorage_set(dim, dimension, 1);
+ THCTensor_(resize)(state, values, dim, NULL);
+ THCudaLongTensor_resize(state, indices, dim, NULL);
+ THLongStorage_free(dim);
+
+ // If sliceSize is 1, copy input to values and set indices
+ if (sliceSize == 1) {
+ THCTensor_(copy)(state, values, input);
+ THCudaLongTensor_fill(state, indices, TH_INDEX_BASE);
+ return;
+ }
+
+ // Requirements for fused kernel implementation:
+ //
+ // 1. sliceSize <= 2 * max threads per block
+ // 2. uses one block per slice, so number of slices must be less than the maximum number of blocks for
+ // a kernel launch
+ // 3. Can use 32-bit index math for indexing (mainly just for implementation conciseness, could be changed)
+ if (sliceSize <= MAX_BLOCK_SIZE &&
+ slices <= MAX_GRID_SIZE &&
+ TensorUtils<THCTensor>::canUse32BitIndexMath(state, input)) {
+ // Beginning our optimized implementation. First thing we want to do is to transpose
+ // the input Tensor along the sort dimension, and then make it contiguous
+ transposed = THCTensor_(newTranspose)(state, input, dimension, ndim - 1);
+ contiguous = THCTensor_(newContiguous)(state, transposed);
+
+ // We also need to view the values and indices Tensors as transposed in order to
+ // properly determine the offset into the underlying storage in which to place the
+ // mode and index for a particular set of dimension values
+ valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim-1);
+ indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim-1);
+
+ // Set-up TensorInfo structs for passing to kernel
+ TensorInfo<real, unsigned int> tiValues = getTensorInfo<THCTensor, unsigned int>(state, valuesTransposed);
+ TensorInfo<long, unsigned int> tiIndices = getTensorInfo<THCudaLongTensor, unsigned int>(state, indicesTransposed);
+
+ // The number of blocks is the number of slices that we need to calculate the mode for. Each block
+ // is responsible for computing a single mode
+ dim3 grid;
+ THC_getGridFromTiles(slices, grid);
+
+ // Each thread handles two elements: round sliceSize up to the nearest power of 2
+ // and use half that many threads per block
+ long ceilPowerOf2 = nextHighestPowerOf2(sliceSize);
+
+ // Macro that calls kernel --> note that we set the block dimensions here, and
+ // the amount of shared memory
+ #define HANDLE_MODE(SIZE) \
+ { \
+ dim3 blockSize(SIZE / 2); \
+\
+ int memsize = (sizeof(real) * SIZE) + (2 * SIZE * sizeof(unsigned int)); \
+ computeMode<real, SIZE> \
+ <<<grid, blockSize, memsize, THCState_getCurrentStream(state)>>>( \
+ THCTensor_(data)(state, contiguous), tiValues, tiIndices, sliceSize); \
+ }
+
+ // Tradeoff between compilation time and the number of specializations. Ideally we would have
+ // one HANDLE_MODE for each power of 2
+ switch(ceilPowerOf2) {
+ case 2048:
+ HANDLE_MODE(2048)
+ break;
+ case 1024:
+ case 512:
+ case 256:
+ HANDLE_MODE(1024)
+ break;
+ case 128:
+ case 64:
+ HANDLE_MODE(128)
+ break;
+ case 32:
+ case 16:
+ case 8:
+ case 4:
+ case 2:
+ HANDLE_MODE(32)
+ break;
+ case 1:
+ default:
+ assert(false);
+ }
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, transposed);
+ THCTensor_(free)(state, contiguous);
+ THCTensor_(free)(state, valuesTransposed);
+ THCudaLongTensor_free(state, indicesTransposed);
+ } else {
+ // Beginning our naive implementation: We don't want to mutate the input Tensor, but
+ // we need to be able to sort the inputs along the dimension in order to calculate the
+ // mode. Additionally, it's ideal if the data along the dimension is contiguous. So
+ // we transpose the dimension with the innermost dimension and make a new contiguous
+ // version that we can use.
+ transposed = THCTensor_(newClone)(state, input);
+ THCTensor_(transpose)(state, transposed, NULL, dimension, ndim - 1);
+ contiguous = THCTensor_(newContiguous)(state, transposed);
+ THCTensor_(free)(state, transposed);
+
+ // We also need to view the values and indices Tensors as transposed in order to
+ // properly determine the offset into the underlying storage in which to place the
+ // mode and index for a particular set of dimension values
+ valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim - 1);
+ indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim - 1);
+
+ // Position is a Storage that will store the dimension values we are processing
+ position = THLongStorage_newWithSize(ndim - 1);
+
+ // Sort Buffer is a Storage that will be used in the internal sort required to calculate
+ // the mode efficiently
+ sortBuffer = THCudaLongStorage_newWithSize(state, sliceSize);
+
+ // Call mode
+ THCTensor_(dimApplyMode)(state, valuesTransposed, indicesTransposed, contiguous, sortBuffer, dimension, position, 0);
+
+ THCTensor_(free)(state, contiguous);
+ THLongStorage_free(position);
+ THCTensor_(free)(state, valuesTransposed);
+ THCudaLongTensor_free(state, indicesTransposed);
+ THCudaLongStorage_free(state, sortBuffer);
+ }
+
+ if (!keepdim) {
+ THCTensor_(squeeze1d)(state, values, values, dimension);
+ THCudaLongTensor_squeeze1d(state, indices, indices, dimension);
+ }
+}
+
+#undef MAX_GRID_SIZE
+#undef MAX_BLOCK_SIZE
+
+#endif
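
THCTensor_(mode) returns the most frequent value along a dimension together with one of its original positions. A fused per-slice kernel is used when the slice fits (sliceSize <= 1024, at most 65535 slices, 32-bit indexable); otherwise each slice is sorted with Thrust and the most frequent key is picked via reduce_by_key and max_element. A quick Lua sketch, assuming torch.mode is wired to this kernel for CUDA tensors by the TensorMath.lua changes in this commit:

  require 'cutorch'
  local x = torch.Tensor({{1, 2, 2, 3},
                          {4, 5, 4, 4}}):cuda()
  local values, indices = torch.mode(x, 2)  -- mode along dim 2
  print(values)   -- expected modes per row: 2 and 4
  print(indices)  -- an original position of each mode within its row
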
diff --git a/lib/THC/generic/THCTensorMode.h b/lib/THC/generic/THCTensorMode.h
new file mode 100644
index 0000000..6f24380
--- /dev/null
+++ b/lib/THC/generic/THCTensorMode.h
@@ -0,0 +1,14 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMode.h"
+#else
+
+/* Returns the mode, and index of the mode, for the set of values
+ * along a given dimension in the input tensor. */
+THC_API void THCTensor_(mode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ int dimension,
+ int keepdim);
+
+#endif // THC_GENERIC_FILE
diff --git a/lib/THC/generic/THCTensorRandom.cu b/lib/THC/generic/THCTensorRandom.cu
index f6d6979..4c6d2fb 100644
--- a/lib/THC/generic/THCTensorRandom.cu
+++ b/lib/THC/generic/THCTensorRandom.cu
@@ -8,7 +8,7 @@
THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
ptrdiff_t size = THCTensor_(nElement)(state, self);
@@ -22,7 +22,7 @@ THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, do
THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
ptrdiff_t size = THCTensor_(nElement)(state, self);
@@ -37,7 +37,7 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean,
THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
@@ -52,7 +52,7 @@ THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea
THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
@@ -67,7 +67,7 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l
THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
@@ -107,7 +107,7 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
int n_sample,
int with_replacement)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, prob_dist));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist));
Generator* gen = THCRandom_getGenerator(state);
int inputSize = THCTensor_(nDimension)(state, prob_dist);
@@ -159,8 +159,9 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
int maxThreads = props->maxThreadsPerBlock;
dim3 block(numCategories < maxThreads ? numCategories : maxThreads);
dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4);
- sampleMultinomialOnce
- <<<grid, block, block.x * sizeof(real),
+ sampleMultinomialOnce<real, accreal>
+ <<<grid, block,
+ block.x * (sizeof(real) * sizeof(accreal)),
THCState_getCurrentStream(state)>>>(
THCudaLongTensor_data(state, self),
numDist,
@@ -266,14 +267,14 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
THC_API void THCTensor_(rand)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(uniform)(state, r_, 0, 1);
}
void THCTensor_(randn)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(normal)(state, r_, 0, 1);
}
@@ -288,7 +289,7 @@ GENERATE_KERNEL1(generate_bernoulli, real, double p, float, curand_uniform, (Sca
THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
ptrdiff_t size = THCTensor_(nElement)(state, self);
@@ -304,7 +305,7 @@ THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
THC_API void THCTensor_(NAME)(THCState* state, \
THCTensor *self_, PROB_TYPE *probs_) \
{ \
- THAssert(THCTensor_(checkGPU)(state, 2, self_, probs_)); \
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \
Generator* gen = THCRandom_getGenerator(state); \
THCTensor *self = THCTensor_(newContiguous)(state, self_); \
PROB_TYPE *probs = PROB_TYPE##_newContiguous(state, probs_); \
@@ -334,7 +335,7 @@ GENERATE_KERNEL1(generate_geometric, real, double p, float, curand_uniform, (Sca
THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
diff --git a/lib/THC/generic/THCTensorScatterGather.cu b/lib/THC/generic/THCTensorScatterGather.cu
index c120f88..c3afbbf 100644
--- a/lib/THC/generic/THCTensorScatterGather.cu
+++ b/lib/THC/generic/THCTensorScatterGather.cu
@@ -9,8 +9,8 @@
void THCTensor_(gather)(THCState* state, THCTensor *tensor,
THCTensor *src, int dim, THCudaLongTensor *index) {
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index));
THArgCheck(THCTensor_(nDimension)(state, src) == THCTensor_(nDimension)(state, tensor), 2,
"Input tensor must have same dimensions as output tensor");
@@ -102,8 +102,8 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor,
tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements);
void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) {
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index));
THArgCheck(dim >= 0 && dim < THCTensor_(nDimension)(state, tensor), 2,
"Index dimension is out of bounds");
@@ -191,8 +191,8 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong
void
THCTensor_(scatterFill)(THCState* state, THCTensor *tensor,
int dim, THCudaLongTensor *index, real value) {
- THAssert(THCTensor_(checkGPU)(state, 1, tensor));
- THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index));
THArgCheck(dim >= 0 && dim < THCTensor_(nDimension)(state, tensor), 2,
"Index dimension is out of bounds");
diff --git a/lib/THC/generic/THCTensorSort.cu b/lib/THC/generic/THCTensorSort.cu
index afef796..067af89 100644
--- a/lib/THC/generic/THCTensorSort.cu
+++ b/lib/THC/generic/THCTensorSort.cu
@@ -281,8 +281,8 @@ THC_API void THCTensor_(sort)(THCState* state,
THCudaLongTensor *indices,
THCTensor *input,
int dim, int order) {
- THAssert(THCTensor_(checkGPU)(state, 2, sorted, input));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, sorted);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
dims = THCTensor_(nDimension)(state, input);
diff --git a/lib/THC/generic/THCTensorTopK.cu b/lib/THC/generic/THCTensorTopK.cu
new file mode 100644
index 0000000..83ab1e1
--- /dev/null
+++ b/lib/THC/generic/THCTensorTopK.cu
@@ -0,0 +1,159 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorTopK.cu"
+#else
+
+THC_API void THCTensor_(topk)(THCState* state,
+ THCTensor *topK,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ long k, int dim, int dir, int sorted) {
+ THAssert(topK != NULL && indices != NULL && input != NULL);
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input));
+ THArgCheck(THCTensor_(nDimension)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+ long dims = THCudaLongTensor_nDimension(state, indices);
+ THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING);
+ int numDims = THCTensor_(nDimension)(state, input);
+ THArgCheck(numDims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
+
+ THArgCheck(dim >= 0 && dim < numDims, 6, "dim not in range");
+
+ long sliceSize = THCTensor_(size)(state, input, dim);
+ THArgCheck(k > 0 && k <= sliceSize, 5, "k not in range for dimension");
+
+ // Build the output size, which is the dim being selected set to
+ // size k
+ THLongStorage* topKSize = THCTensor_(newSizeOf)(state, input);
+ THLongStorage_set(topKSize, dim, k);
+ THCTensor_(resize)(state, topK, topKSize, NULL);
+ THCudaLongTensor_resize(state, indices, topKSize, NULL);
+ THLongStorage_free(topKSize);
+
+#define RUN_K(INDEX_T, DIM, DIR) \
+ gatherTopK<real, INDEX_T, DIM, DIR> \
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
+ inputInfo, \
+ sliceSize, \
+ k, \
+ inputSlices, \
+ /* The actual dimension that the k-selection is running in */ \
+ /* may have changed from collapseDims() */ \
+ inputInfo.strides[collapseInputDim], \
+ topKInfo, \
+ topKSlices, \
+ topKInfo.strides[collapseTopKDim], \
+ indicesInfo, \
+ indicesInfo.strides[collapseIndicesDim])
+
+#define RUN_DIR(INDEX_T, DIM) \
+ if (dir) { \
+ RUN_K(INDEX_T, DIM, true); \
+ } else { \
+ RUN_K(INDEX_T, DIM, false); \
+ }
+
+#define RUN_DIM(INDEX_T) \
+ if (allDims == 1) { \
+ RUN_DIR(INDEX_T, 1); \
+ } else if (allDims == 2) { \
+ RUN_DIR(INDEX_T, 2); \
+ } else if (allDims == 3) { \
+ RUN_DIR(INDEX_T, 3); \
+ } else { \
+ RUN_DIR(INDEX_T, -1); \
+ }
+
+#define RUN_T(INDEX_T) \
+ TensorInfo<real, INDEX_T> inputInfo = \
+ getTensorInfo<THCTensor, INDEX_T>(state, input); \
+ TensorInfo<real, INDEX_T> topKInfo = \
+ getTensorInfo<THCTensor, INDEX_T>(state, topK); \
+ TensorInfo<long, INDEX_T> indicesInfo = \
+ getTensorInfo<THCudaLongTensor, INDEX_T>(state, indices); \
+ \
+ /* We use these structures solely to find the offset to */ \
+ /* each slice we are operating on */ \
+ inputInfo.sizes[dim] = 1; \
+ topKInfo.sizes[dim] = 1; \
+ indicesInfo.sizes[dim] = 1; \
+ \
+ /* Collapse all other dims */ \
+ int collapseInputDim = inputInfo.collapseDims(dim); \
+ int collapseTopKDim = topKInfo.collapseDims(dim); \
+ int collapseIndicesDim = indicesInfo.collapseDims(dim); \
+ \
+ long inputSlices = 1; \
+ long topKSlices = 1; \
+ for (int i = 0; i < numDims; ++i) { \
+ inputSlices *= inputInfo.sizes[i]; \
+ topKSlices *= topKInfo.sizes[i]; \
+ } \
+ \
+ dim3 grid; \
+ if (!THC_getGridFromTiles(inputSlices, grid)) { \
+ THError("Slice to sort is too large"); \
+ } \
+ \
+ dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \
+ \
+ /* This is used as a template parameter to calculate indices. */ \
+ /* We only specialize it if all collapsed dim sizes are the */ \
+ /* same; otherwise, we use -1 which is the specialization */ \
+ /* parameter for arbitrary dimensions */ \
+ int allDims = inputInfo.dims; \
+ if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \
+ allDims = -1; \
+ } \
+ \
+ RUN_DIM(INDEX_T);
+
+ // Based on required index size, run the algorithm with the
+ // appropriate index type
+ if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, input) &&
+ TensorUtils<THCTensor>::canUse32BitIndexMath(state, topK) &&
+ TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
+ RUN_T(unsigned int);
+ } else {
+ RUN_T(unsigned long);
+ }
+#undef RUN_T
+#undef RUN_DIM
+#undef RUN_DIR
+#undef RUN_K
+
+ // Sort the results if the user wants them sorted, since our
+ // selection routine does not ensure sorting
+ if (sorted) {
+ // FIXME: the k/v inplace sort along slice only works for size <=
+ // 2048 at the moment
+ if (sliceSize <= 2048) {
+ // This avoids any memory allocations and performs all sorting
+ // work inplace along the slice
+ THCTensor_(sortKeyValueInplace)(state, topK, indices, dim, dir);
+ } else {
+ // Depend upon the backup sort that returns indices, which we
+ // can use in conjunction with gather to produce the original
+ // indices.
+ // This is not the most efficient implementation, especially since
+ // there are memory allocations performed here. If the user desires
+ // greater performance, they should torch.gather() the results
+ // themselves using the reported indices, providing previously
+ // allocated tensors to receive the results.
+ THCTensor* sortedTopK = THCTensor_(new)(state);
+ THCudaLongTensor* sortedIndices = THCudaLongTensor_new(state);
+ THCTensor_(sort)(state, sortedTopK, sortedIndices, topK, dim, dir);
+
+ THCudaLongTensor* sortedTopKIndices = THCudaLongTensor_new(state);
+
+ THCudaLongTensor_resizeAs(state, sortedTopKIndices, indices);
+ THCudaLongTensor_gather(state, sortedTopKIndices, indices, dim, sortedIndices);
+
+ THCTensor_(freeCopyTo)(state, sortedTopK, topK);
+ THCudaLongTensor_freeCopyTo(state, sortedTopKIndices, indices);
+ THCudaLongTensor_free(state, sortedIndices);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif // THC_GENERIC_FILE
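
The generic topk above resizes both outputs to size k along dim, runs the per-slice gatherTopK selection kernel, and, if sorted is requested, sorts the selected values in place when the slice is small enough (<= 2048) or falls back to the sort-plus-gather path described in the comments. A small Lua sketch, assuming the usual torch.topk signature (input, k, dim, dir, sorted) and that dir == true selects the largest values:

  require 'cutorch'
  local x = torch.Tensor({5, 1, 4, 2, 3}):cuda()
  local vals, idx = torch.topk(x, 3, 1, true, true)
  print(vals)  -- the three largest values (5, 4, 3 under the assumptions above)
  print(idx)   -- their positions in x
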
diff --git a/lib/THC/generic/THCTensorTopK.h b/lib/THC/generic/THCTensorTopK.h
new file mode 100644
index 0000000..2c281b5
--- /dev/null
+++ b/lib/THC/generic/THCTensorTopK.h
@@ -0,0 +1,13 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorTopK.h"
+#else
+
+/* Returns the k smallest (or largest) elements along the given dimension, */
+/* depending on `dir`. */
+THC_API void THCTensor_(topk)(THCState* state,
+ THCTensor* topK,
+ THCudaLongTensor* indices,
+ THCTensor* input,
+ long k, int dim, int dir, int sorted);
+
+#endif // THC_GENERIC_FILE
diff --git a/rocks/cutorch-1.0-0.rockspec b/rocks/cutorch-1.0-0.rockspec
index 07e309e..d904a52 100644
--- a/rocks/cutorch-1.0-0.rockspec
+++ b/rocks/cutorch-1.0-0.rockspec
@@ -21,16 +21,15 @@ dependencies = {
build = {
type = "command",
build_command = [[
-
jopts=$(getconf _NPROCESSORS_CONF)
echo "Building on $jopts cores"
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
]],
platforms = {
windows = {
build_command = [[
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
]]
}
},
diff --git a/rocks/cutorch-scm-1.rockspec b/rocks/cutorch-scm-1.rockspec
index 8314385..5dbdfbe 100644
--- a/rocks/cutorch-scm-1.rockspec
+++ b/rocks/cutorch-scm-1.rockspec
@@ -24,12 +24,12 @@ build = {
jopts=$(getconf _NPROCESSORS_CONF)
echo "Building on $jopts cores"
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
]],
platforms = {
windows = {
build_command = [[
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
]]
}
},
diff --git a/test/test.lua b/test/test.lua
index 32918b1..bd78a4f 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -175,13 +175,13 @@ local function createTestTensor(maxSize)
end
local function isEqual(x, y, tolerance, ...)
- if a == nil and b == nil then return true end
- if a == nil and b ~= nil then return false end
- if a ~= nil and b == nil then return false end
+ if x == nil and y == nil then return true end
+ if x == nil and y ~= nil then return false end
+ if x ~= nil and y == nil then return false end
- -- clone the tensors so we can modify the contents if necessary for testing
- local a = x:clone()
- local b = y:clone()
+ -- if x, y are tensors clone them so we can modify the contents if necessary for testing
+ local a = type(x) ~= 'number' and x:clone() or x
+ local b = type(y) ~= 'number' and y:clone() or y
if torch.type(b) ~= torch.type(a) then
b = b:typeAs(a) -- TODO: remove the need for this (a-b doesnt work for bytetensor, cudatensor pairs)
@@ -271,7 +271,6 @@ local function compareFloatAndCuda(x, fn, ...)
.. "are different for function '%s'", tostring(fn)))
for k, _ in ipairs(rcpu) do
if not isEqual(rcpu[k], rcuda[k], tolerance) then
- print(args)
tester:assert(false, errstr)
end
end
@@ -365,10 +364,11 @@ end
-- indexMode = true: keep indexing and masking Tensors as their CPU equivalents
-- false: convert them to baseType when doing CUDA
-- x = first argument tensor
+-- limit: number of returns to compare, if nil, compares all returns
-- gpu2cpu_map = map of gpu types to cpu types
-- fn = function name (as string), or the function
-- ... = the rest of arguments to fn
-local function compareCPUAndCUDATypeTensorArgsWithConv(cudaType, gpu2cpu_map, indexMode, x, fn, ...)
+local function compareCPUAndCUDATypeTensorArgsWithConvInternal(cudaType, gpu2cpu_map, indexMode, limit, x, fn, ...)
local baseType = t2cpu[cudaType]
assert(baseType, 'Cannot find baseType for ' .. cudaType)
local x_cpu = x:type(baseType)
@@ -421,23 +421,30 @@ local function compareCPUAndCUDATypeTensorArgsWithConv(cudaType, gpu2cpu_map, in
tester:assert(#rcpu == #rcuda,
string.format("number of return arguments for CPU and CUDA "
.. "are different for function '%s'", tostring(fn)))
- for k, _ in ipairs(rcpu) do
- tester:assert(isEqual(rcpu[k], rcuda[k], tolerance),
- string.format(errstrval, k, divval(rcpu[k], rcuda[k])))
+
+ if limit ~= nil then
+ for k = 1, limit do
+ tester:assert(isEqual(rcpu[k], rcuda[k], tolerance),
+ string.format(errstrval, k, divval(rcpu[k], rcuda[k])))
+ end
+ else
+ for k, _ in ipairs(rcpu) do
+ tester:assert(isEqual(rcpu[k], rcuda[k], tolerance),
+ string.format(errstrval, k, divval(rcpu[k], rcuda[k])))
+ end
end
+
-- also test x in case function changed object
tester:assert(isEqual(x_cpu, x_cuda, tolerance),
string.format(errstrobj, divval(x_cpu, x_cuda)))
end
--- baseType = the tensor type to test
--- indexMode = true: keep indexing and masking Tensors as their CPU equivalents
--- false: convert then to baseType when doing CUDA
--- x = first argument tensor
--- fn = function name (as string), or the function)
--- ... = the rest of arguments to fn
local function compareCPUAndCUDATypeTensorArgs(cudaType, indexMode, x, fn, ...)
- compareCPUAndCUDATypeTensorArgsWithConv(cudaType, nil, indexMode, x, fn, ...)
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(cudaType, nil, indexMode, nil, x, fn, ...)
+end
+
+local function compareCPUAndCUDATypeTensorArgsWithLimit(cudaType, indexMode, limit, x, fn, ...)
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(cudaType, nil, indexMode, limit, x, fn, ...)
end
function test.squeeze()
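The new limit argument lets a caller compare only the first N return values when later returns (such as index tensors) may legitimately differ between CPU and CUDA, for example when ties are broken differently. A minimal usage sketch, illustrative only, mirroring the topk test later in this diff:

    -- compare only the first return (the values) of topk; the index tensor
    -- may differ between backends when the input contains duplicate values
    local x = torch.FloatTensor(10):uniform()
    compareCPUAndCUDATypeTensorArgsWithLimit('torch.CudaTensor', nil, 1, x, 'topk', 3, 1, true, true)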
@@ -816,7 +823,7 @@ function test.copyAsync()
cutorch.streamSynchronize(cutorch.getStream())
tester:assertTensorEq(device_tensor:double(), host_tensor:double(), 0,
"Async copy to host failed.")
- end
+ end
end
function test.largeNoncontiguous()
@@ -882,6 +889,98 @@ function test.ones()
torch.setdefaulttensortype(t)
end
+function test.linspace()
+ local sz1 = chooseInt(minsize, maxsize)
+ local sz2 = chooseInt(minsize, maxsize)
+ local n = sz1 * sz2
+ local a = torch.uniform()
+ local b = torch.uniform()
+ local x = torch.FloatTensor():rand(sz1, sz2)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b, n)
+ end
+ checkMultiDevice(x, 'linspace', a, b, n)
+
+ -- Check range for non-contiguous tensors.
+ local x = createTestTensorWithSizes(true, true, {sz1, sz2})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b, n)
+ end
+ checkMultiDevice(x, 'linspace', a, b, n)
+
+ -- Check new tensor creation
+ local x = torch.FloatTensor()
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b, n)
+ end
+ checkMultiDevice(x, 'linspace', a, b, n)
+
+ -- Check n = 1 case
+ local x = torch.rand(1)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, a, 1)
+ end
+ checkMultiDevice(x, 'linspace', a, a, 1)
+
+ -- Check default parameter case
+ local x = createTestTensorWithSizes(true, true, {100})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b)
+ end
+ checkMultiDevice(x, 'linspace', a, b)
+end
+
+function test.logspace()
+ local sz1 = chooseInt(minsize, maxsize)
+ local sz2 = chooseInt(minsize, maxsize)
+ local n = sz1 * sz2
+ local a = torch.uniform()
+ local b = torch.uniform()
+ local x = torch.FloatTensor():rand(sz1, sz2)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b, n)
+ end
+ checkMultiDevice(x, 'logspace', a, b, n)
+
+ -- Check range for non-contiguous tensors.
+ local x = createTestTensorWithSizes(true, true, {sz1, sz2})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b, n)
+ end
+ checkMultiDevice(x, 'logspace', a, b, n)
+
+ -- Check new tensor creation
+ local x = torch.FloatTensor()
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b, n)
+ end
+ checkMultiDevice(x, 'logspace', a, b, n)
+
+ -- Check n = 1 case
+ local x = torch.rand(1)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, a, 1)
+ end
+ checkMultiDevice(x, 'logspace', a, a, 1)
+
+ -- Check default parameter case
+ local x = createTestTensorWithSizes(true, true, {100})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b)
+ end
+ checkMultiDevice(x, 'logspace', a, b)
+end
+
function test.add()
local sz1 = chooseInt(minsize, maxsize)
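For context on what the new linspace/logspace tests above compare against, the CPU semantics are: torch.linspace(a, b, n) produces n evenly spaced points from a to b, and torch.logspace(a, b, n) produces n points spaced evenly on a log scale from 10^a to 10^b; both default to n = 100, which is why the default-parameter case uses a 100-element tensor. A small illustrative sketch:

    local lin = torch.linspace(0, 1, 5)   -- 0.00, 0.25, 0.50, 0.75, 1.00
    local log = torch.logspace(0, 2, 3)   -- 1, 10, 100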
@@ -904,6 +1003,35 @@ function test.add()
checkMultiDevice(x, 'add', y, v, z)
end
+local test_bitops = function(funcname, tmin, tmax, vmin, vmax)
+ local sz1 = chooseInt(minsize, maxsize)
+ local sz2 = chooseInt(minsize, maxsize)
+ local x = torch.IntTensor(sz1, sz2):random(tmin, tmax)
+ local v = torch.random(vmin, vmax)
+ compareCPUAndCUDATypeTensorArgs('torch.CudaIntTensor', nil, x, funcname, v)
+ checkMultiDevice(x, funcname, v)
+end
+
+function test.lshift()
+ test_bitops('lshift', 1, 1000, 1, 10)
+end
+
+function test.rshift()
+ test_bitops('rshift', 1000, 1000000, 1, 10)
+end
+
+function test.bitand()
+ test_bitops('bitand', 1, 1000, 1, 255)
+end
+
+function test.bitor()
+ test_bitops('bitor', 1, 1000, 1, 255)
+end
+
+function test.bitxor()
+ test_bitops('bitxor', 1, 1000, 1, 255)
+end
+
function test.csub()
local sz1 = chooseInt(minsize, maxsize)
local sz2 = chooseInt(minsize, maxsize)
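The new bitwise tests above assume element-wise integer semantics on the CUDA side that match CPU torch; a rough sketch of what they exercise, assuming the usual in-place x:op(value) convention:

    local x = torch.IntTensor({1, 2, 3})
    x:lshift(2)    -- left shift: x becomes {4, 8, 12}, i.e. each element times 2^2
    x:bitand(6)    -- bitwise AND with 6: x becomes {4, 0, 4}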
@@ -1481,6 +1609,60 @@ function test.diag()
checkMultiDevice(y1, 'diag', k)
end
+function test.range()
+ local xmin = chooseInt(minsize, maxsize)
+ local xmax = chooseInt(xmin, maxsize)
+ local step = 3
+ local size = math.floor((xmax - xmin) / step + 1)
+ -- Base case
+ local x = torch.FloatTensor():rand(size)
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check range for non-contiguous tensors.
+ local x = createTestTensorWithSizes(true, true, {size})
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check new tensor creation
+ local x = torch.Tensor()
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check negative step case
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmax, xmin, -step)
+ end
+ checkMultiDevice(x, 'range', xmax, xmin, -step)
+
+ -- Check default parameter case
+ local x = createTestTensorWithSizes(true, true, {size})
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check floating step case
+ local step = 1.3
+ local x = torch.Tensor()
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+end
+
function test.trace()
local sz1 = chooseInt(minsize, maxsize)
local sz2 = chooseInt(minsize, maxsize)
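For reference, the range semantics the tests above rely on: torch.range(xmin, xmax, step) fills a tensor with the sequence xmin, xmin+step, ... without exceeding xmax (the size is floor((xmax - xmin)/step + 1), as computed above); step defaults to 1 and may be negative. A short sketch:

    local r1 = torch.range(2, 10, 3)    -- 2, 5, 8
    local r2 = torch.range(5, 1, -2)    -- 5, 3, 1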
@@ -1877,10 +2059,10 @@ local function testIndexAdd(types, gpu2cpu_map)
for k, typename in ipairs(types) do
local ctype = t2cpu[typename]
local x, src = x:type(ctype), src:type(ctype)
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, true, nil, x, 'indexAdd',
index, longIndex, src)
if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, false, nil, x, 'indexAdd',
index, longIndex, src)
end
end
@@ -1892,10 +2074,10 @@ local function testIndexAdd(types, gpu2cpu_map)
for k, typename in ipairs(types) do
local ctype = t2cpu[typename]
local x, src = x:type(ctype), src:type(ctype)
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, true, nil, x, 'indexAdd',
index, longIndex, src)
if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, false, nil, x, 'indexAdd',
index, longIndex, src)
end
end
@@ -1908,10 +2090,10 @@ local function testIndexAdd(types, gpu2cpu_map)
for k, typename in ipairs(types) do
local ctype = t2cpu[typename]
local x, src = x:type(ctype), src:type(ctype)
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, true, nil, x, 'indexAdd',
index, longIndex, src)
if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, false, nil, x, 'indexAdd',
index, longIndex, src)
end
end
@@ -3593,43 +3775,482 @@ function test.sort()
tester:assert(isEqual(gather_cpu, gather_gpu), 'indices mismatch')
end
+local function explore(typename, func, t, topk, indices)
+ if t:nDimension() == 1 then
+ func(typename, t, topk, indices)
+ else
+ for i = 1, t:size(1) do
+ explore(typename, func, t[i], topk[i], indices[i])
+ end
+ end
+end
+
function test.topk()
- local function runTopK(t, dim, k, dir)
- -- FIXME: if the tensors ever contain equivalent values, then their indices
- -- could in fact be different.
+ -- need to ensure unique values for index checking, so for the first pass we create Tensors
+ -- with sizes less than the maximum range of values for that type
+ local counts = {}
+ counts['torch.CudaByteTensor'] = 255
+ counts['torch.CudaCharTensor'] = 255
+ counts['torch.CudaShortTensor'] = 65536
+ counts['torch.CudaIntTensor'] = 2 ^ 20
+ counts['torch.CudaTensor'] = 2 ^ 20
+ counts['torch.CudaLongTensor'] = 2 ^ 20
+ counts['torch.CudaDoubleTensor'] = 2 ^ 20
+ counts['torch.CudaHalfTensor'] = 32768
- if torch.Tensor.type(t) == 'torch.CudaTensor' then
- return t:topk(k, dim, dir, true)
- else
- local sorted, indices = t:sort(dim, dir)
- return sorted:narrow(dim, 1, k), indices:narrow(dim, 1, k)
+ for _, typename in ipairs(typenames) do
+ for tries = 1, 5 do
+ local t = createTestTensor(counts[typename]):type(typename)
+ local dim = chooseInt(1, t:nDimension())
+ local dimSize = t:size(dim)
+ local dir = chooseInt(1, 2) == 1
+
+ -- Test boundary conditions
+ local kTests = {1, dimSize}
+
+ -- and some other random ones
+ table.insert(kTests, chooseInt(1, dimSize))
+ for i = 1, 2 do
+ -- some sizes that fit in our inplace kernel range (the dimSize one
+ -- will fall back to Thrust)
+ table.insert(kTests, chooseInt(1, math.min(2048, dimSize)))
+ end
+
+ for k = 1, #kTests do
+ compareCPUAndCUDATypeTensorArgsWithLimit(typename, nil, 1, t, 'topk', kTests[k], dim, dir, true)
+
+ -- verify that indices picked yield topk value in original tensor
+ local topk, indices = t:topk(kTests[k], dim, dir, true)
+ local verify = function(typename, t, topk, indices)
+ t = t:type(t2cpu[typename])
+ indices = indices:long()
+ topk = topk:type(t2cpu[typename])
+ for i = 1, indices:size(1) do
+ tester:assert(t[indices[i]] == topk[i])
+ end
+ end
+
+ local tt = t:transpose(dim, t:nDimension())
+ local ttk = topk:transpose(dim, topk:nDimension())
+ local tti = indices:transpose(dim, indices:nDimension())
+
+ explore(typename, verify, tt, ttk, tti)
+ end
end
end
+end
- for tries = 1, 5 do
- -- max size 2^20 for indexing
- local t = createTestTensor(2 ^ 20)
- local dim = chooseInt(1, t:nDimension())
- local dimSize = t:size(dim)
- local dir = chooseInt(1, 2) == 1
+local function verifyMode1D(tensor)
+ -- We cannot rely on comparing against CPU-Torch, since the way it resolves
+ -- ties between equal modes and picks the corresponding index is not
+ -- reliable. Instead we count occurrences ourselves with :apply and verify
+ -- the GPU results against those counts
+
+ -- counts is a table of tensor element -> # of occurrences
+ local counts = {}
+
+ -- populate counts by iterating over the elements in the tensor
+ tensor:apply(function(x) if counts[x] == nil then counts[x] = 1 else counts[x] = counts[x] + 1 end return x end)
+
+ -- next, calculate the max occurrence in the tensor
+ local max = -1;
+ for _, count in pairs(counts) do
+ if count > max then max = count end
+ end
+
+ -- now verify for all the GPU types that 1. the mode picked has max occurrences,
+ -- and 2. that the index returned contains that mode
+
+ for _, cudaType in ipairs(typenames) do
+ local baseType = t2cpu[cudaType]
+ assert(baseType, 'Cannot find baseType for ' .. cudaType)
+ local x_cpu = tensor:clone():type(baseType)
+ local x_cuda = cloneExactlyToGPUType(x_cpu, nil, t2gpu)
+
+ local modes, indices = x_cuda:mode()
+
+ -- 1D, so should only be a single return
+ tester:assert(modes:nElement() == 1, 'mode returned an invalid number of values')
+ tester:assert(indices:nElement() == 1, 'mode returned an invalid number of indices')
+ local mode = modes[1]
+ local index = indices[1]
+
+ tester:assert(counts[mode] == max, string.format(
+ 'Type: %s --> Selected mode of %s which has count of %s, but mode must have %s occurrences',
+ cudaType, tostring(mode), tostring(counts[mode]), tostring(max)
+ ))
+ tester:assert(tensor[index] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[index]), tostring(mode)
+ ))
+ end
+end
- -- Test boundary conditions
- local kTests = {1, dimSize}
+local function assertSize(tensor, sizes)
+ local valid = true
+ if tensor:nDimension() ~= #sizes then
+ tester:assert(false, 'tensor dimension mismatch')
+ end
+ for i, size in ipairs(sizes) do
+ if tensor:size(i) ~= size then
+ valid = false
+ end
+ end
+ tester:assert(valid, 'tensor size mismatch')
+end
+
+local function verifyMode2D(tensor, onlyDim)
+ local dims = {}
+ if onlyDim ~= nil then
+ dims = {onlyDim}
+ else
+ dims = {1, 2}
+ end
+
+ for _, dim in ipairs(dims) do
+ -- In the case of a 2D Tensor, we need to calculate the count for each slice
+ -- sCounts is a table containing the counts of elements for each slice,
+ -- sMax is a table containing the max occurrence for each slice
+ local sCounts = {}
+ local sMax = {}
+
+ -- First, we use the :split() function to split the Tensor
+ -- Suppose we are mode'ing a 5x10 Tensor. If we mode along dim=1,
+ -- we have a result Tensor that is 1x10, so we need the counts for
+ -- all 10 slices of size=5. So we actually split along dim=2, with
+ -- size = 1, to yield 10 5x1 tensors
+ local splits = tensor:split(1, dim == 1 and 2 or 1)
+
+ -- next, we iterate over these split Tensors to calculate the mode, as we
+ -- did in the 1D case
+ for i, slice in pairs(splits) do
+ local counts = {}
+ slice:apply(function(x) if counts[x] == nil then counts[x] = 1 else counts[x] = counts[x] + 1 end return x end)
+
+ local max = -1;
+ for _, count in pairs(counts) do
+ if count > max then max = count end
+ end
- -- and some other random ones
- table.insert(kTests, chooseInt(1, dimSize))
- for i = 1, 2 do
- -- some sizes that fit in our inplace kernel range (the dimSize one
- -- will fall back to Thrust)
- table.insert(kTests, chooseInt(1, math.min(2048, dimSize)))
+ sCounts[i] = counts;
+ sMax[i] = max;
end
- for k = 1, #kTests do
- compareFloatAndCuda(t, runTopK, dim, kTests[k], dir)
+ -- verification pass
+ for _, cudaType in ipairs(typenames) do
+ local baseType = t2cpu[cudaType]
+ assert(baseType, 'Cannot find baseType for ' .. cudaType)
+ local x_cpu = tensor:clone():type(baseType)
+ local x_cuda = cloneExactlyToGPUType(x_cpu, nil, t2gpu)
+ local modes, indices = x_cuda:mode(dim)
+
+ -- 2D, so expect:
+ -- (dim = 1) a 1xsize(tensor, dim = 2) tensor
+ -- (dim = 2) a size(tensor, dim = 1)x1 tensor
+
+ if dim == 1 then
+ assertSize(modes, {1, tensor:size(2)})
+ assertSize(indices, {1, tensor:size(2)})
+ else
+ assertSize(modes, {tensor:size(1), 1})
+ assertSize(indices, {tensor:size(1), 1})
+ end
+
+ -- we need to run through and verify that all of the modes/indices are valid, for
+ -- the results of each slice. First, we squeeze the Tensor, so we can iterate over
+ -- both the 1D/2D values in the same manner
+ modes = modes:squeeze(dim)
+ indices = indices:squeeze(dim)
+
+ -- iterate over each slice, and verify that for each slice the mode selected has
+ -- max occurrences, and the index points to the mode
+ for i, counts in pairs(sCounts) do
+ local max = sMax[i]
+ local mode = modes[i]
+ local index = indices[i]
+
+ tester:assert(counts[mode] == max, string.format(
+ 'Type: %s --> Selected mode of %s which has count of %s, but mode must have %s occurrences',
+ cudaType, tostring(mode), tostring(counts[mode]), tostring(max)
+ ))
+
+ if dim == 1 then
+ tester:assert(tensor[index][i] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[index][i]), tostring(mode)
+ ))
+ else
+ tester:assert(tensor[i][index] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[i][index]), tostring(mode)
+ ))
+ end
+ end
end
end
end
+local function verifyMode3D(tensor, onlyDim)
+ local dims = {}
+ if onlyDim ~= nil then
+ dims = {onlyDim}
+ else
+ dims = {1, 2, 3}
+ end
+ -- In the case of 3D Tensor, we need to calculate the count for each slice,
+ -- but this time, we have two layers of depth, for each of the non-mode dims
+ -- so sCounts is a multi-level table where sCounts[i][j] is the counts for
+ -- (_, i, j), (i, _, j) or (i, j, _) depending on the dim
+ local sCounts = {}
+ local sMax = {}
+
+ -- Suppose we have a 2x3x4 Tensor T:
+ -- (1, .., ..), (2, .., ..)
+ -- [1, 2, 3, 4] [3, 2, 2, 4]
+ -- [5, 6, 7, 8] [5, 6, 8, 7]
+ -- [9, 10, 11, 12] [1, 10, 11, 1]
+ --
+ -- Then for dim = 1, we need counts to be a multi-level table (3x4xcounts)
+ -- 2 (2x4xcounts)
+ -- 3 (2x3xcounts)
+ --
+ -- Results: dim = 1
+ -- {1:
+ -- {1:
+ -- 1 --> 1,
+ -- 3 --> 1,
+ -- 2:
+ -- 2 --> 2,
+ -- 3:
+ -- 2 --> 1,
+ -- 3 --> 1,
+ -- 4:
+ -- 4 --> 2,
+ -- },
+ -- {2:
+ -- {1:
+ -- 5 --> 2,
+ -- ...
+
+ -- used to set loop bounds and indexing to construct the above table using the loop below
+ local dbounds = {
+ {tensor:size(2), tensor:size(3), tensor:size(1)},
+ {tensor:size(1), tensor:size(3), tensor:size(2)},
+ {tensor:size(1), tensor:size(2), tensor:size(3)}}
+ local dfuncs = {
+ function(tensor, i, j, k) return tensor[k][i][j] end,
+ function(tensor, i, j, k) return tensor[i][k][j] end,
+ function(tensor, i, j, k) return tensor[i][j][k] end,
+ }
+
+ -- loop...
+ for d, bounds in ipairs(dbounds) do
+ sCounts[d] = {}
+ sMax[d] = {}
+ for i = 1, bounds[1] do
+ sCounts[d][i] = {}
+ sMax[d][i] = {}
+ for j = 1, bounds[2] do
+ sCounts[d][i][j] = {}
+ sMax[d][i][j] = {}
+ for k = 1, bounds[3] do
+ local v = dfuncs[d](tensor, i, j, k)
+ if sCounts[d][i][j][v] == nil then
+ sCounts[d][i][j][v] = 1
+ else
+ sCounts[d][i][j][v] = sCounts[d][i][j][v] + 1
+ end
+
+ local max = -1
+ for _, count in pairs(sCounts[d][i][j]) do
+ if count > max then max = count end
+ end
+ sMax[d][i][j] = max
+ end -- k
+ end -- j
+ end -- i
+ end -- d
+
+
+ -- verification pass
+ for _, dim in ipairs(dims) do
+ for _, cudaType in ipairs(typenames) do
+ local baseType = t2cpu[cudaType]
+ assert(baseType, 'Cannot find baseType for ' .. cudaType)
+ local x_cpu = tensor:clone():type(baseType)
+ local x_cuda = cloneExactlyToGPUType(x_cpu, nil, t2gpu)
+ local modes, indices = x_cuda:mode(dim)
+
+ if dim == 1 then
+ assertSize(modes, {1, tensor:size(2), tensor:size(3)})
+ assertSize(indices, {1, tensor:size(2), tensor:size(3)})
+ elseif dim == 2 then
+ assertSize(modes, {tensor:size(1), 1, tensor:size(3)})
+ assertSize(indices, {tensor:size(1), 1, tensor:size(3)})
+ else
+ assertSize(modes, {tensor:size(1), tensor:size(2), 1})
+ assertSize(indices, {tensor:size(1), tensor:size(2), 1})
+ end
+
+ -- squeeze on mode dim
+ modes = modes:squeeze(dim)
+ indices = indices:squeeze(dim)
+
+ -- iterate over slices
+ for i, js in pairs(sCounts[dim]) do
+ for j, counts in pairs(js) do
+ local max = sMax[dim][i][j]
+ local mode = modes[i][j]
+ local index = indices[i][j]
+
+ tester:assert(counts[mode] == max, string.format(
+ 'Type: %s --> Selected mode of %s which has count of %s, but mode must have %s occurrences',
+ cudaType, tostring(mode), tostring(counts[mode]), tostring(max)
+ ))
+
+ if dim == 1 then
+ tester:assert(tensor[index][i][j] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[index][i][j]), tostring(mode)
+ ))
+ elseif dim == 2 then
+ tester:assert(tensor[i][index][j] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[i][index][j]), tostring(mode)
+ ))
+ else
+ tester:assert(tensor[i][j][index] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[i][j][index]), tostring(mode)
+ ))
+ end
+
+ end -- j
+ end --i
+ end -- tensor type
+ end -- dim
+end
+
+function test.mode()
+ -- Tests for 1D Tensors
+
+ -- Single-element Tensor
+ local input = torch.FloatTensor({1})
+ verifyMode1D(input)
+
+ -- Tensor of all the same values
+ local input = torch.FloatTensor(10):fill(1)
+ verifyMode1D(input)
+
+ -- Tensor with a unique range of values
+ local input = torch.FloatTensor({4, 3, 6, 8, 2, 1})
+ verifyMode1D(input)
+
+ -- Handles ties when there are two things with equal counts
+ local input = torch.FloatTensor({2, 2, 1, 1})
+ verifyMode1D(input)
+
+ -- Big Range of Values: (4 is the mode)
+ local input = torch.FloatTensor({
+ 1, 4, 4, 4, 4, 1, 1, 2, 2, 2, 3, 4, 5, 5, 4, 4, 4, 4, 4, 4,
+ 2, 2, 1, 1, 2, 3, 4, 4, 4, 4, 2, 3, 4, 4, 3, 2, 1, 2, 3, 4})
+ verifyMode1D(input)
+
+ -- Larger Example
+ local input = torch.FloatTensor(1000):apply(function(x) return torch.random(1, 10) end)
+ verifyMode1D(input)
+
+ -- verify input is unchanged
+ local input = torch.FloatTensor({4, 3, 6, 8, 2, 1})
+ local same = torch.FloatTensor({4, 3, 6, 8, 2, 1})
+ torch.mode(input)
+ tester:assertTensorEq(input, same, 0, 'cutorch mode modified input')
+
+ -- Tests for 2D Tensors
+
+ -- Tensor of all the same values
+ local input = torch.FloatTensor(3, 4):fill(1)
+ verifyMode2D(input)
+
+ -- Tensor with a unique range of values
+ local input = torch.FloatTensor({{2, 3, 5, 7},
+ {1, 10, 17, 6},
+ {0, 22, 14, 9}})
+ verifyMode2D(input)
+
+ -- Consistency between ties when there are two things with equal counts
+ local input = torch.FloatTensor({{2, 2, 3, 3},
+ {1, 1, 3, 3},
+ {2, 2, 1, 1},
+ {1, 1, 1, 1}})
+ verifyMode2D(input)
+
+ -- Larger example
+ local input = torch.FloatTensor(50, 100):apply(function(x) return torch.random(1, 10) end)
+ verifyMode2D(input)
+
+ -- Tests for 3D Tensors
+
+ -- Tensor of all the same values
+ local input = torch.FloatTensor(2, 4, 5):fill(1)
+ verifyMode3D(input)
+
+ -- Tensor with a unique range of values
+ local input = torch.FloatTensor(
+ {
+ {{2, 3, 5, 7},
+ {1, 10, 17, 6},
+ {0, 22, 14, 9}},
+
+ {{32, 88, 25, 4},
+ {21, 78, 57, 111},
+ {15, 68, 64, 117}}
+ }
+ )
+ verifyMode3D(input)
+
+ -- Handles ties when there are two things with equal counts
+ local input = torch.FloatTensor(
+ {
+ {{2, 2, 3, 3},
+ {1, 1, 3, 3},
+ {2, 2, 1, 1},
+ {1, 1, 1, 1}},
+
+ {{3, 3, 4, 4},
+ {2, 2, 4, 4},
+ {3, 3, 2, 2},
+ {2, 2, 2, 2}},
+ }
+ )
+ verifyMode3D(input)
+
+ -- Larger example
+ local input = torch.FloatTensor(14, 22, 32):apply(function(x) return torch.random(1, 10) end)
+ verifyMode3D(input)
+end
+
+function test.bigmode()
+ -- Examples that overflow the fused-kernel path
+ local input = torch.IntTensor(16384):apply(function(x) return torch.random(1, 100) end)
+ verifyMode1D(input)
+
+ local input = torch.FloatTensor(4096, 4):fill(1)
+ verifyMode2D(input, 1)
+
+ local input = torch.FloatTensor(4, 4096):fill(1)
+ verifyMode2D(input, 2)
+
+ local input = torch.FloatTensor(2, 2, 4096):fill(1)
+ verifyMode3D(input, 3)
+
+ local input = torch.FloatTensor(2, 4096, 2):fill(1)
+ verifyMode3D(input, 2)
+
+ local input = torch.FloatTensor(4096, 2, 2):fill(1)
+ verifyMode3D(input, 1)
+end
+
function test.cat()
for k, typename in ipairs(typenames) do
for dim = 1, 3 do
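The verifyMode* helpers above all follow the same pattern: build reference counts on the CPU with :apply, then check that the value cutorch returns as the mode has the maximal count and that the returned index points at an occurrence of it. A condensed sketch of that check for a single 1D slice, illustrative only and assuming cutorch is loaded:

    local slice = torch.FloatTensor({2, 2, 1, 1, 3})
    local counts = {}
    slice:apply(function(v) counts[v] = (counts[v] or 0) + 1 return v end)
    local best = -1
    for _, c in pairs(counts) do if c > best then best = c end end
    local modes, indices = slice:cuda():mode()     -- 1-element tensors for 1D input
    assert(counts[modes[1]] == best)               -- mode has maximal count
    assert(slice[indices[1]] == modes[1])          -- index points at the mode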
@@ -3661,6 +4282,32 @@ function test.cat()
end
end
+function test.catNoDim()
+ for k, typename in ipairs(typenames) do
+ local a
+ local b
+ local c
+
+ a = torch.Tensor(minsize):uniform():type(typename)
+ b = torch.Tensor(minsize):uniform():type(typename)
+ c = torch.cat(a, b)
+ tester:assertTensorEq(c:narrow(1, 1, minsize), a, 0, 'torch.cat value')
+ tester:assertTensorEq(c:narrow(1, minsize + 1, minsize), b, 0, 'torch.cat value')
+
+ a = torch.Tensor(1, minsize):uniform():type(typename)
+ b = torch.Tensor(1, minsize):uniform():type(typename)
+ c = torch.cat(a, b)
+ tester:assertTensorEq(c:narrow(2, 1, minsize), a, 0, 'torch.cat value')
+ tester:assertTensorEq(c:narrow(2, minsize + 1, minsize), b, 0, 'torch.cat value')
+
+ a = torch.Tensor(10, minsize):uniform():type(typename)
+ b = torch.Tensor(10, minsize):uniform():type(typename)
+ c = torch.cat(a, b)
+ tester:assertTensorEq(c:narrow(2, 1, minsize), a, 0, 'torch.cat value')
+ tester:assertTensorEq(c:narrow(2, minsize + 1, minsize), b, 0, 'torch.cat value')
+ end
+end
+
function test.catArray()
for k, typename in ipairs(typenames) do
for dim = 1, 3 do
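test.catNoDim above relies on torch.cat concatenating along the last dimension of its inputs when no dimension is given; a quick sketch of that behaviour:

    local a = torch.CudaTensor(3, 4):uniform()
    local b = torch.CudaTensor(3, 4):uniform()
    local c = torch.cat(a, b)    -- no dim given: concatenates along dim 2, so c is 3x8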
@@ -4157,7 +4804,7 @@ function test.kernelP2PAccess()
end
end
-if os.getenv('THC_CACHING_ALLOCATOR') == '1' then
+if os.getenv('THC_CACHING_ALLOCATOR') ~= '0' then
local function getCyclesPerMs()
cutorch.synchronize()
local t = torch.Timer()
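These allocator tests delay GPU work by a known wall-clock amount via cutorch._sleep(), which, as used here, stalls the current stream for a given number of device cycles; getCyclesPerMs(), continued in the unchanged part of the file, calibrates how many cycles correspond to one millisecond. A rough sketch of that calibration, for orientation only:

    cutorch.synchronize()
    local timer = torch.Timer()
    cutorch._sleep(1e6)                              -- stall for ~1e6 device cycles
    cutorch.synchronize()
    local cyclesPerMs = 1e6 / (timer:time().real * 1000)
    -- later, cutorch._sleep(50 * cyclesPerMs) delays the stream by roughly 50 ms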
@@ -4170,8 +4817,8 @@ if os.getenv('THC_CACHING_ALLOCATOR') == '1' then
local cyclesPerMs = getCyclesPerMs()
-- check that allocations are re-used after deletion
- t = cutorch.createCudaHostTensor({1})
- ptr = t:data()
+ local t = cutorch.createCudaHostTensor({1})
+ local ptr = t:data()
t = nil; collectgarbage()
t = cutorch.createCudaHostTensor({1})
tester:asserteq(t:data(), ptr, 'allocation not reused')
@@ -4184,6 +4831,31 @@ if os.getenv('THC_CACHING_ALLOCATOR') == '1' then
t = cutorch.createCudaHostTensor({1})
tester:assertne(t:data(), ptr, 'allocation re-used too soon')
end
+
+ function test.cachedPinnedMemoryMultiGPU()
+ local device_count = cutorch.getDeviceCount()
+ if device_count < 2 then
+ return
+ end
+
+ local cyclesPerMs = getCyclesPerMs()
+ local t = cutorch.createCudaHostTensor(1)
+ local ptr = t:data()
+ t[1] = 1
+
+ local gpu_tensor1 = torch.CudaTensor({0})
+
+ cutorch.setDevice(2)
+ local gpu_tensor2 = torch.CudaTensor({0})
+ cutorch._sleep(50 * cyclesPerMs) -- delay the copy
+ gpu_tensor2:copyAsync(t)
+
+ cutorch.setDevice(1)
+ t = nil; collectgarbage();
+ t = cutorch.createCudaHostTensor(1)
+ tester:assertne(t:data(), ptr, 'allocation re-used too soon')
+ end
+
end
-- unfortunately, torch.Tester() forgot setUp and tearDown functions.
diff --git a/torch/utils.h b/torch/utils.h
index ae959b7..8d3c455 100644
--- a/torch/utils.h
+++ b/torch/utils.h
@@ -26,7 +26,7 @@
# define TORCH_API TORCH_EXTERNC
#endif
-#if LUA_VERSION_NUM == 501
+#ifndef HAS_LUAL_SETFUNCS
/*
** Adapted from Lua 5.2.0
*/
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-cutorch.git