[lua-torch-cutorch] 01/03: New upstream version 0~20170511-g92e9c08
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Mon May 22 04:27:46 UTC 2017
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-cutorch.
commit a035f60e90e1740d09a34092194e7c65183f557c
Author: Zhou Mo <cdluminate at gmail.com>
Date: Mon May 22 04:27:07 2017 +0000
New upstream version 0~20170511-g92e9c08
---
CMakeLists.txt | 14 +
README.md | 1 +
TensorMath.lua | 144 +++++-
init.c | 27 ++
init.lua | 30 +-
lib/THC/CMakeLists.txt | 71 ++-
lib/THC/THC.h | 1 -
lib/THC/THCAsmUtils.cuh | 50 +-
lib/THC/THCAtomics.cuh | 1 +
lib/THC/THCBlas.cu | 33 ++
lib/THC/THCBlas.h | 4 +
lib/THC/THCCachingAllocator.cpp | 159 +++++-
lib/THC/THCCachingAllocator.h | 10 +
lib/THC/THCCachingHostAllocator.cpp | 84 +++-
lib/THC/THCCachingHostAllocator.h | 3 +-
lib/THC/THCGeneral.c | 231 ++++++++-
lib/THC/THCGeneral.h.in | 32 +-
lib/THC/THCHalf.cu | 90 ----
lib/THC/THCNumerics.cuh | 30 +-
lib/THC/THCReduce.cuh | 7 +-
lib/THC/THCReduceAll.cuh | 2 +-
lib/THC/THCReduceApplyUtils.cuh | 114 +++--
lib/THC/THCScanUtils.cuh | 122 ++++-
lib/THC/THCSortUtils.cu | 17 +
lib/THC/THCSortUtils.cuh | 70 ++-
lib/THC/THCStream.c | 30 --
lib/THC/THCStream.cpp | 60 +++
lib/THC/THCStream.h | 2 +
lib/THC/THCTensorConv.cu | 8 +-
lib/THC/THCTensorCopy.h | 1 +
lib/THC/THCTensorMath.cu | 26 +
lib/THC/THCTensorMath.h | 6 +
lib/THC/THCTensorMath2.cu | 2 +-
lib/THC/THCTensorMathPairwise.cu | 80 ++-
lib/THC/THCTensorMathPointwise.cuh | 129 ++++-
lib/THC/THCTensorMathReduce.cu | 4 +-
lib/THC/THCTensorMathReduce.cuh | 11 +-
lib/THC/THCTensorMathScan.cu | 10 +-
lib/THC/THCTensorMode.cu | 16 +
lib/THC/THCTensorMode.cuh | 282 +++++++++++
lib/THC/THCTensorRandom.cuh | 65 ++-
lib/THC/THCTensorScatterGather.cu | 9 +-
lib/THC/THCTensorSort.cu | 16 -
lib/THC/THCTensorSort.cuh | 1 -
lib/THC/THCTensorTopK.cu | 524 +-------------------
lib/THC/THCTensorTopK.cuh | 485 +++++++++++++++++++
lib/THC/THCTensorTopK.h | 14 -
lib/THC/THCTensorTypeUtils.cu | 8 +
lib/THC/THCTensorTypeUtils.cuh | 2 +
lib/THC/generic/THCTensor.c | 137 ++++--
lib/THC/generic/THCTensor.cu | 2 +-
lib/THC/generic/THCTensor.h | 6 +-
lib/THC/generic/THCTensorCopy.c | 8 +-
lib/THC/generic/THCTensorIndex.cu | 22 +-
lib/THC/generic/THCTensorMasked.cu | 12 +-
lib/THC/generic/THCTensorMath.cu | 140 ++++--
lib/THC/generic/THCTensorMath.h | 8 +
lib/THC/generic/THCTensorMathBlas.cu | 244 +++++++++-
lib/THC/generic/THCTensorMathBlas.h | 3 +
lib/THC/generic/THCTensorMathCompare.cu | 24 +-
lib/THC/generic/THCTensorMathCompareT.cu | 24 +-
lib/THC/generic/THCTensorMathMagma.cu | 67 ++-
lib/THC/generic/THCTensorMathPairwise.cu | 132 ++++-
lib/THC/generic/THCTensorMathPairwise.h | 5 +
lib/THC/generic/THCTensorMathPointwise.cu | 182 ++++++-
lib/THC/generic/THCTensorMathPointwise.h | 6 +
lib/THC/generic/THCTensorMathReduce.cu | 87 ++--
lib/THC/generic/THCTensorMathReduce.h | 16 +-
lib/THC/generic/THCTensorMathScan.cu | 39 +-
lib/THC/generic/THCTensorMode.cu | 315 ++++++++++++
lib/THC/generic/THCTensorMode.h | 14 +
lib/THC/generic/THCTensorRandom.cu | 27 +-
lib/THC/generic/THCTensorScatterGather.cu | 12 +-
lib/THC/generic/THCTensorSort.cu | 4 +-
lib/THC/generic/THCTensorTopK.cu | 159 ++++++
lib/THC/generic/THCTensorTopK.h | 13 +
rocks/cutorch-1.0-0.rockspec | 5 +-
rocks/cutorch-scm-1.rockspec | 4 +-
test/test.lua | 776 ++++++++++++++++++++++++++++--
torch/utils.h | 2 +-
80 files changed, 4403 insertions(+), 1230 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d1d0a0..8d3ece7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,20 @@ INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/torch")
SET(src Storage.c init.c Tensor.c TensorMath.c TensorOperator.c torch/utils.c)
SET(luasrc init.lua Tensor.lua FFI.lua test/test.lua)
+set(CMAKE_REQUIRED_INCLUDES ${LUA_INCDIR})
+include(CheckCSourceCompiles)
+check_c_source_compiles("
+#include <lauxlib.h>
+int main()
+{
+ long i = sizeof(&luaL_setfuncs);
+ return 0;
+}
+" HAS_LUAL_SETFUNCS)
+if(HAS_LUAL_SETFUNCS)
+ add_definitions(-DHAS_LUAL_SETFUNCS)
+endif()
+
ADD_TORCH_WRAP(cudatensormathwrap TensorMath.lua)
ADD_TORCH_PACKAGE(cutorch "${src}" "${luasrc}")
diff --git a/README.md b/README.md
index 3b4a174..263a131 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ With the caching memory allocator, device allocations and frees should logically
- `cutorch.getState()` - Returns the global state of the cutorch package. This state is not for users, it stores the raw RNG states, cublas handles and other thread and device-specific stuff.
- `cutorch.withDevice(devID, f)` - This is a convenience for multi-GPU code, that takes in a device ID as well as a function f. It switches cutorch to the new device, executes the function f, and switches back cutorch to the original device.
- `cutorch.createCudaHostTensor([...])` - Allocates a `torch.FloatTensor` of [host-pinned memory](https://devblogs.nvidia.com/parallelforall/how-optimize-data-transfers-cuda-cc/), where dimensions can be given as an argument list of sizes or a `torch.LongStorage`.
+- `cutorch.isCachingAllocatorEnabled()` - Returns whether the caching CUDA memory allocator is enabled or not.
#### Low-level streams functions (dont use this as a user, easy to shoot yourself in the foot):
- `cutorch.reserveStreams(n [, nonblocking])`: creates n user streams for use on every device. NOTE: stream index `s` on device 1 is a different cudaStream_t than stream `s` on device 2. Takes an optional non-blocking flag; by default, this is assumed to be false. If true, then the stream is created with cudaStreamNonBlocking.
diff --git a/TensorMath.lua b/TensorMath.lua
index 936d897..0971de0 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -661,6 +661,18 @@ for k, Tensor_ in pairs(handledTypenames) do
{name=Tensor, method={default=1}},
{name=real}})
+ wrap("lshift",
+ cname("lshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("rshift",
+ cname("rshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
wrap("fmod",
cname("fmod"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -673,13 +685,33 @@ for k, Tensor_ in pairs(handledTypenames) do
{name=Tensor, method={default=1}},
{name=real}})
+ wrap("bitand",
+ cname("bitand"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("bitor",
+ cname("bitor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("bitxor",
+ cname("bitxor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
wrap("equal",
cname("equal"),
{{name=Tensor},
{name=Tensor},
{name="boolean", creturned=true}})
- for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
+ local cfuncs = {"cmul", "cpow", "cdiv", "cremainder", "cfmod",
+ "clshift", "crshift", "cbitand", "cbitor", "cbitxor"}
+ for _, name in ipairs(cfuncs) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -712,7 +744,8 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name='CudaLongTensor', default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
end
for _,name in ipairs({"cmin", "cmax"}) do
@@ -763,7 +796,8 @@ for k, Tensor_ in pairs(handledTypenames) do
cname("sum"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _, name in ipairs({"cumsum", "cumprod"}) do
wrap(name,
@@ -780,7 +814,8 @@ for k, Tensor_ in pairs(handledTypenames) do
cname("prod"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("mean",
cname("meanall"),
@@ -789,7 +824,8 @@ for k, Tensor_ in pairs(handledTypenames) do
cname("mean"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("maskedFill",
cname("maskedFill"),
@@ -853,6 +889,24 @@ for k, Tensor_ in pairs(handledTypenames) do
{name="boolean", default=0}}
)
+ wrap("topk",
+ cname("topk"),
+ {{name=Tensor, default=true, returned=true},
+ {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+ {name=Tensor},
+ {name="long", default=1},
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=0},
+ {name="boolean", default=0}})
+
+ wrap("mode",
+ cname("mode"),
+ {{name=Tensor, default=true, returned=true, noreadadd=true},
+ {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+ {name=Tensor},
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=true, invisible=true}})
+
wrap("squeeze",
cname("squeeze"),
{{name=Tensor, default=true, returned=true, postcall=function(arg)
@@ -933,6 +987,13 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name="CudaLongTensor", default=true, returned=true},
{name=Tensor}})
+ wrap("range",
+ cname("range"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=accreal},
+ {name=accreal},
+ {name=accreal, default=1}})
+
if real == 'float' or real == 'double' or real == 'half' then
for _,name in ipairs({"log", "log1p", "exp",
"cos", "acos", "cosh",
@@ -949,6 +1010,20 @@ for k, Tensor_ in pairs(handledTypenames) do
end
+ wrap("linspace",
+ cname("linspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
+ wrap("logspace",
+ cname("logspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
wrap("pow",
cname("pow"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1002,7 +1077,8 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=real},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("renorm",
cname("renorm"),
@@ -1029,7 +1105,8 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name="index"},
- {name="boolean", default=false}})
+ {name="boolean", default=false},
+ {name="boolean", default=true, invisible=true}})
end
wrap("tril",
@@ -1055,8 +1132,6 @@ for k, Tensor_ in pairs(handledTypenames) do
{{name=Tensor},
{name=accreal, creturned=true}})
-
-
wrap("lerp",
cname("lerp"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1393,6 +1468,20 @@ wrap("zeros",
{{name=Tensor, default=true, returned=true, method={default='nil'}},
{name="LongArg"}})
+wrap("linspace",
+ cname("linspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
+wrap("logspace",
+ cname("logspace"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name="long", default=100}})
+
wrap("reshape",
cname("reshape"),
{{name=Tensor, default=true, returned=true},
@@ -1457,7 +1546,9 @@ wrap("equal",
{name=Tensor},
{name="boolean", creturned=true}})
-for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
+local cfuncs = {"cmul", "cpow", "cdiv", "cremainder", "cfmod",
+ "clshift", "crshift", "cbitand", "cbitor", "cbitxor"}
+for _, name in ipairs(cfuncs) do
wrap(name,
cname(name),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -1552,6 +1643,14 @@ wrap("topk",
{name="boolean", default=0},
{name="boolean", default=0}})
+wrap("mode",
+ cname("mode"),
+ {{name=Tensor, default=true, returned=true, noreadadd=true},
+ {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+ {name=Tensor},
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=true, invisible=true}})
+
do
local Tensor = Tensor
local real = real
@@ -1701,7 +1800,8 @@ wrap("sum",
cname("sum"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _, name in ipairs({"cumsum", "cumprod"}) do
wrap(name,
@@ -1718,7 +1818,8 @@ wrap("prod",
cname("prod"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _,name in ipairs({"min", "max"}) do
wrap(name,
@@ -1729,7 +1830,8 @@ for _,name in ipairs({"min", "max"}) do
{{name=Tensor, default=true, returned=true},
{name='CudaLongTensor', default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
end
for _,name in ipairs({"cmin", "cmax"}) do
@@ -1875,6 +1977,13 @@ wrap("nonzero",
{{name="CudaLongTensor", default=true, returned=true},
{name=Tensor}})
+wrap("range",
+ cname("range"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=real},
+ {name=real},
+ {name=real, default=1}})
+
wrap("geometric",
cname("geometric"),
{{name=Tensor, returned=true},
@@ -2022,7 +2131,8 @@ wrap("mean",
cname("mean"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _,name in ipairs({"var", "std"}) do
wrap(name,
@@ -2033,7 +2143,8 @@ for _,name in ipairs({"var", "std"}) do
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name="index"},
- {name="boolean", default=false}})
+ {name="boolean", default=false},
+ {name="boolean", default=true, invisible=true}})
end
wrap("norm",
@@ -2045,7 +2156,8 @@ wrap("norm",
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=real},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("renorm",
cname("renorm"),
diff --git a/init.c b/init.c
index 894be2e..8b32a1a 100644
--- a/init.c
+++ b/init.c
@@ -699,6 +699,14 @@ static int cutorch_setKernelPeerToPeerAccess(lua_State *L)
return 0;
}
+static int cutorch_isCachingAllocatorEnabled(lua_State *L)
+{
+ THCState *state = cutorch_getstate(L);
+ lua_pushboolean(L, THCState_isCachingAllocatorEnabled(state));
+
+ return 1;
+}
+
static int cutorch_getMemoryUsage(lua_State *L) {
size_t freeBytes = 0;
size_t totalBytes = 0;
@@ -780,6 +788,22 @@ static int cutorch_getDeviceProperties(lua_State *L)
return 1;
}
+static int cutorch_getRuntimeVersion(lua_State *L)
+{
+ int version;
+ THCudaCheck(cudaRuntimeGetVersion(&version));
+ lua_pushnumber(L, version);
+ return 1;
+}
+
+static int cutorch_getDriverVersion(lua_State *L)
+{
+ int version;
+ THCudaCheck(cudaDriverGetVersion(&version));
+ lua_pushnumber(L, version);
+ return 1;
+}
+
static int cutorch_seed(lua_State *L)
{
unsigned long long seed = THCRandom_seed(cutorch_getstate(L));
@@ -977,7 +1001,10 @@ static const struct luaL_Reg cutorch_stuff__ [] = {
{"setPeerToPeerAccess", cutorch_setPeerToPeerAccess},
{"setKernelPeerToPeerAccess", cutorch_setKernelPeerToPeerAccess},
{"getKernelPeerToPeerAccess", cutorch_getKernelPeerToPeerAccess},
+ {"isCachingAllocatorEnabled", cutorch_isCachingAllocatorEnabled},
{"getDeviceProperties", cutorch_getDeviceProperties},
+ {"getRuntimeVersion", cutorch_getRuntimeVersion},
+ {"getDriverVersion", cutorch_getDriverVersion},
{"getMemoryUsage", cutorch_getMemoryUsage},
{"hasHalfInstructions", cutorch_hasHalfInstructions},
{"hasFastHalfInstructions", cutorch_hasFastHalfInstructions},
diff --git a/init.lua b/init.lua
index fdb7b08..59665c3 100644
--- a/init.lua
+++ b/init.lua
@@ -49,27 +49,23 @@ local function longTensorSize(...)
return size
end
--- Creates a FloatTensor using the CudaHostAllocator.
--- Accepts either a LongStorage or a sequence of numbers.
-function cutorch.createCudaHostTensor(...)
- local size = longTensorSize(...)
- local storage = torch.FloatStorage(cutorch.CudaHostAllocator, size:prod())
- return torch.FloatTensor(storage, 1, size:storage())
+local hostTypes = {'Float', 'Double', 'Int', 'Long', 'Byte'}
+if cutorch.hasHalf then
+ table.insert(hostTypes, 'Half')
end
-function cutorch.createCudaHostDoubleTensor(...)
- local size = longTensorSize(...)
- local storage = torch.DoubleStorage(cutorch.CudaHostAllocator, size:prod())
- return torch.DoubleTensor(storage, 1, size:storage())
+for _, ty in ipairs(hostTypes) do
+ -- Creates torch Tensors using the CudaHostAllocator.
+ -- Accepts either a LongStorage or a sequence of numbers.
+ cutorch['createCudaHost' .. ty .. 'Tensor'] = function(...)
+ local size = longTensorSize(...)
+ local storage = torch[ty .. 'Storage'](cutorch.CudaHostAllocator, size:prod())
+ return torch[ty .. 'Tensor'](storage, 1, size:storage())
+ end
end
-if cutorch.hasHalf then
- function cutorch.createCudaHostHalfTensor(...)
- local size = longTensorSize(...)
- local storage = torch.HalfStorage(cutorch.CudaHostAllocator, size:prod())
- return torch.HalfTensor(storage, 1, size:storage())
- end
- end
+-- Alias to automate creation from both torch and cutorch types
+cutorch.createCudaHostTensor = cutorch.createCudaHostFloatTensor
-- Creates a CudaTensor using the CudaUVAAllocator.
-- Accepts either a LongStorage or a sequence of numbers.
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index 0e08120..1ea6039 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -3,6 +3,7 @@ CMAKE_POLICY(VERSION 2.8)
SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
OPTION(NDEBUG "disable asserts (WARNING: this may result in invalid memory accesses)")
IF(NOT NDEBUG)
MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
@@ -50,6 +51,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
THCTensorRandom.cpp
THCCachingAllocator.cpp
THCCachingHostAllocator.cpp
+ THCStream.cpp
PROPERTIES COMPILE_FLAGS -std=${CXX_VERSION})
ELSE()
SET(CMAKE_CXX_STANDARD 11)
@@ -59,6 +61,10 @@ ENDIF()
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
+IF ("$ENV{STATIC_TH}" STREQUAL "YES")
+LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+ENDIF()
+
IF(MAGMA_FOUND)
INCLUDE_DIRECTORIES(${MAGMA_INCLUDE_DIR})
SET(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}")
@@ -130,9 +136,9 @@ IF(NOT THC_INSTALL_BIN_SUBDIR
SET(THC_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR})
ELSE(Torch_INSTALL_BIN_SUBDIR)
# not installing in a Torch context, so Torch_INSTALL_BIN_SUBDIR is not available
- SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")
- SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")
- SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")
+ SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")
+ SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")
+ SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")
SET(THC_INSTALL_CMAKE_SUBDIR "share/cmake/THC" CACHE PATH "THC install cmake subdirectory")
ENDIF(Torch_INSTALL_BIN_SUBDIR)
@@ -153,7 +159,7 @@ SET(src
THCCachingHostAllocator.cpp
THCGeneral.c
THCStorageCopy.c
- THCStream.c
+ THCStream.cpp
THCTensor.c
THCTensorCopy.c
THCTensorRandom.cpp
@@ -182,6 +188,8 @@ SET(src-cuda
THCTensorTopK.cu
THCTensorSort.cu
THCTensorTypeUtils.cu
+ THCSortUtils.cu
+ THCTensorMode.cu
)
# loop over all types
@@ -208,28 +216,33 @@ ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+IF ("$ENV{STATIC_TH}" STREQUAL "YES")
+ CUDA_ADD_LIBRARY(THC STATIC ${src} ${src-cuda})
+ SET_TARGET_PROPERTIES(THC PROPERTIES COMPILE_FLAGS "-fPIC")
+ELSE()
+ CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
+ CUDA_ADD_CUBLAS_TO_TARGET(THC)
+ TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY})
+
+ IF(USE_MAGMA)
+ TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES})
+ ENDIF(USE_MAGMA)
+
+ IF(NOT THC_SO_VERSION)
+ SET(THC_SO_VERSION 0)
+ ENDIF(NOT THC_SO_VERSION)
+ MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
+ SET_TARGET_PROPERTIES(THC PROPERTIES
+ VERSION ${THC_SO_VERSION}
+ SOVERSION ${THC_SO_VERSION})
+
+
+ INSTALL(TARGETS THC
+ RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
+ LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
+ ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
+ENDIF()
-CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
-CUDA_ADD_CUBLAS_TO_TARGET(THC)
-TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY})
-
-IF(USE_MAGMA)
- TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES} ${CUDA_cusparse_LIBRARY})
-ENDIF(USE_MAGMA)
-
-IF(NOT THC_SO_VERSION)
- SET(THC_SO_VERSION 0)
-ENDIF(NOT THC_SO_VERSION)
-MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
-SET_TARGET_PROPERTIES(THC PROPERTIES
- VERSION ${THC_SO_VERSION}
- SOVERSION ${THC_SO_VERSION})
-
-
-INSTALL(TARGETS THC
- RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
- LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
- ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
INSTALL(FILES
THC.h
@@ -245,7 +258,6 @@ INSTALL(FILES
THCTensorRandom.h
THCTensorMath.h
THCTensorConv.h
- THCTensorTopK.h
THCApply.cuh
THCReduce.cuh
THCReduceAll.cuh
@@ -276,10 +288,13 @@ INSTALL(FILES
THCNumerics.cuh
THCTensorSort.cuh
THCTensorInfo.cuh
+ THCTensorMathPointwise.cuh
THCTensorTypeUtils.cuh
THCTensorRandom.cuh
THCTensorMathMagma.cuh
THCThrustAllocator.cuh
+ THCTensorMode.cuh
+ THCTensorTopK.cuh
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
INSTALL(FILES
@@ -324,4 +339,8 @@ INSTALL(FILES
generic/THCDeviceTensorUtils.cu
generic/THCTensorRandom.h
generic/THCTensorRandom.cu
+ generic/THCTensorMode.h
+ generic/THCTensorMode.cu
+ generic/THCTensorTopK.h
+ generic/THCTensorTopK.cu
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/lib/THC/THC.h b/lib/THC/THC.h
index e3840dc..90a3a53 100644
--- a/lib/THC/THC.h
+++ b/lib/THC/THC.h
@@ -15,6 +15,5 @@
#include "THCTensorRandom.h"
#include "THCTensorMath.h"
#include "THCTensorConv.h"
-#include "THCTensorTopK.h"
#endif
diff --git a/lib/THC/THCAsmUtils.cuh b/lib/THC/THCAsmUtils.cuh
index 7015d20..f0dc90b 100644
--- a/lib/THC/THCAsmUtils.cuh
+++ b/lib/THC/THCAsmUtils.cuh
@@ -3,20 +3,44 @@
// Collection of direct PTX functions
-__device__ __forceinline__
-unsigned int getBitfield(unsigned int val, int pos, int len) {
- unsigned int ret;
- asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
- return ret;
-}
+template <typename T>
+struct Bitfield {};
-__device__ __forceinline__
-unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
- unsigned int ret;
- asm("bfi.b32 %0, %1, %2, %3, %4;" :
- "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
- return ret;
-}
+template <>
+struct Bitfield<unsigned int> {
+ static __device__ __forceinline__
+ unsigned int getBitfield(unsigned int val, int pos, int len) {
+ unsigned int ret;
+ asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+
+ static __device__ __forceinline__
+ unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
+ unsigned int ret;
+ asm("bfi.b32 %0, %1, %2, %3, %4;" :
+ "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+};
+
+template <>
+struct Bitfield<unsigned long long int> {
+ static __device__ __forceinline__
+ unsigned long long int getBitfield(unsigned long long int val, int pos, int len) {
+ unsigned long long int ret;
+ asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+
+ static __device__ __forceinline__
+ unsigned long long int setBitfield(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
+ unsigned long long int ret;
+ asm("bfi.b64 %0, %1, %2, %3, %4;" :
+ "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
+ return ret;
+ }
+};
__device__ __forceinline__ int getLaneId() {
int laneId;
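The free getBitfield/setBitfield functions become a Bitfield<T> template so that callers keying on 64-bit values (presumably the generic topk/sort paths added later in this patch) can use the bfe.u64/bfi.b64 forms alongside the existing 32-bit ones. A small device-side sketch using the 64-bit specialization; the kernel itself is illustrative and not part of the patch:

    #include "THCAsmUtils.cuh"

    /* extract a radix digit from each 64-bit key */
    __global__ void extractDigit(const unsigned long long *keys,
                                 unsigned int *digits, int n, int pos, int len)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        digits[i] = (unsigned int)
            Bitfield<unsigned long long int>::getBitfield(keys[i], pos, len);
      }
    }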
diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh
index ac0b45f..7a0be48 100644
--- a/lib/THC/THCAtomics.cuh
+++ b/lib/THC/THCAtomics.cuh
@@ -2,6 +2,7 @@
#define THC_ATOMICS_INC
#include "THCHalf.h"
+#include "THCNumerics.cuh"
template <typename T, size_t n>
struct AtomicAddIntegerImpl;
diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu
index c438ad8..9db4f0b 100644
--- a/lib/THC/THCBlas.cu
+++ b/lib/THC/THCBlas.cu
@@ -389,6 +389,39 @@ void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot,
THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize));
}
+THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize)
+{
+ if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) )
+ {
+ THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize"
+ "with the bound [val] <= %d", INT_MAX);
+ }
+
+ // no need to adjust leading dimensions, since matrices are square
+ cublasOperation_t opa = convertTransToCublasOperation(transa);
+
+ cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+ cublasSetStream(handle, THCState_getCurrentStream(state));
+ THCublasCheck(cublasSgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize));
+}
+
+
+THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize)
+{
+ if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) )
+ {
+ THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize"
+ "with the bound [val] <= %d", INT_MAX);
+ }
+
+ // no need to adjust leading dimensions, since matrices are square
+ cublasOperation_t opa = convertTransToCublasOperation(transa);
+
+ cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+ cublasSetStream(handle, THCState_getCurrentStream(state));
+ THCublasCheck(cublasDgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize));
+}
+
void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) {
if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) )
diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h
index bf91f93..25246b1 100644
--- a/lib/THC/THCBlas.h
+++ b/lib/THC/THCBlas.h
@@ -35,6 +35,10 @@ THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb,
/* Inverse */
THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize);
THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize);
+
+THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize);
+THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize);
+
THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize);
THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize);
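The new Sgetrs/Dgetrs entry points pair with the existing batched LU factorizations, presumably backing the batched-solve additions in generic/THCTensorMathBlas.cu. A hedged sketch of driving them for a single 2x2 system, following the cuBLAS batched convention that a/b are device arrays of device pointers, getrf's pivot/info buffers live on the device, and getrs reports through a host int; error checking and result readback are omitted to keep it short:

    /* illustrative only; `state` is an initialized THCState* */
    #include "THC.h"

    static void solve2x2(THCState *state)
    {
      const int n = 2, nrhs = 1, batch = 1;
      float hA[4] = {4.f, 1.f, 2.f, 3.f};           /* column-major 2x2 */
      float hB[2] = {1.f, 2.f};
      float *dA, *dB, **dAarray, **dBarray;
      int *dPivot, *dInfo, hInfo = 0;

      cudaMalloc(&dA, sizeof(hA));  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
      cudaMalloc(&dB, sizeof(hB));  cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);
      cudaMalloc(&dAarray, sizeof(float *));
      cudaMemcpy(dAarray, &dA, sizeof(float *), cudaMemcpyHostToDevice);
      cudaMalloc(&dBarray, sizeof(float *));
      cudaMemcpy(dBarray, &dB, sizeof(float *), cudaMemcpyHostToDevice);
      cudaMalloc(&dPivot, n * sizeof(int));
      cudaMalloc(&dInfo, sizeof(int));

      THCudaBlas_Sgetrf(state, n, dAarray, n, dPivot, dInfo, batch);    /* LU factorize in place */
      THCudaBlas_Sgetrs(state, 'n', n, nrhs, (const float **)dAarray, n,
                        dPivot, dBarray, n, &hInfo, batch);             /* dB now holds x */

      cudaFree(dInfo);   cudaFree(dPivot);
      cudaFree(dBarray); cudaFree(dAarray);
      cudaFree(dB);      cudaFree(dA);
    }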
diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp
index eeae04a..11d1467 100644
--- a/lib/THC/THCCachingAllocator.cpp
+++ b/lib/THC/THCCachingAllocator.cpp
@@ -1,6 +1,7 @@
#include "THCCachingAllocator.h"
#include <cuda_runtime_api.h>
+#include <deque>
#include <map>
#include <memory>
#include <mutex>
@@ -17,7 +18,7 @@
// split. If no block is found, the allocator will delegate to cudaMalloc.
// - If the cudaMalloc fails, the allocator will free all cached blocks that
// are not split and retry the allocation.
-// - Large (>1MB) and small allocation requestss are handled separately. Large
+// - Large (>1MB) and small allocation requests are handled separately. Large
// allocation requests can be filled by a cudaMalloc call of the exact size.
// Small requests will allocate and split a 1MB buffer, if necessary.
//
@@ -26,26 +27,36 @@
// launches. The programmer must insert the proper synchronization if memory
// segments are used from multiple streams.
//
+// The library provides a recordStream() function to help insert the correct
+// synchronization when allocations are used on multiple streams. This will
+// ensure that the block is not reused before each recorded stream completes
+// work.
+//
namespace {
+typedef std::shared_ptr<THCStream> THCStreamPtr;
+typedef std::set<THCStreamPtr> stream_set;
+
const size_t kRoundSmall = 512; // round up small allocs to 512 bytes
const size_t kRoundLarge = 131072; // round up large allocs to 128 KiB
const size_t kSmallAlloc = 1048576; // largest "small" allocation is 1 MiB
struct Block {
- int device; // gpu
- cudaStream_t stream; // allocation stream
- size_t size; // block size in bytes
- char* ptr; // memory address
- bool allocated; // in-use flag
- Block* prev; // prev block if split from a larger allocation
- Block* next; // next block if split from a larger allocation
+ int device; // gpu
+ cudaStream_t stream; // allocation stream
+ stream_set stream_uses; // streams on which the block was used
+ size_t size; // block size in bytes
+ char* ptr; // memory address
+ bool allocated; // in-use flag
+ Block* prev; // prev block if split from a larger allocation
+ Block* next; // next block if split from a larger allocation
+ int event_count; // number of outstanding CUDA events
Block(int device, cudaStream_t stream, size_t size, char* ptr=NULL) :
- device(device), stream(stream), size(size), ptr(ptr), allocated(0),
- prev(NULL), next(NULL) { }
+ device(device), stream(stream), stream_uses(), size(size), ptr(ptr),
+ allocated(0), prev(NULL), next(NULL), event_count(0) { }
};
static bool BlockComparator(const Block* a, const Block* b)
@@ -69,9 +80,12 @@ struct THCCachingAllocator
typedef bool (*Comparison)(const Block*, const Block*);
typedef std::set<Block*, Comparison> FreeBlocks;
- // lock around malloc and free
+ // lock around all operations
std::mutex mutex;
+ // lock around calls to cudaFree (to prevent deadlocks with NCCL)
+ std::mutex cuda_free_mutex;
+
// cached blocks larger than 1 MB
FreeBlocks large_blocks;
@@ -81,6 +95,9 @@ struct THCCachingAllocator
// allocated blocks by device pointer
std::unordered_map<void*, Block*> allocated_blocks;
+ // outstanding cuda events
+ std::deque<std::pair<cudaEvent_t, Block*>> cuda_events;
+
THCCachingAllocator() :
large_blocks(BlockComparator),
small_blocks(BlockComparator) {}
@@ -96,6 +113,11 @@ struct THCCachingAllocator
return err;
}
+ err = process_events();
+ if (err != cudaSuccess) {
+ return err;
+ }
+
size = round_size(size);
bool small = size <= kSmallAlloc;
@@ -156,15 +178,13 @@ struct THCCachingAllocator
Block* block = it->second;
allocated_blocks.erase(it);
-
- bool small = block->size <= kSmallAlloc;
- auto& free_blocks = small ? large_blocks : small_blocks;
- try_merge_blocks(block, block->prev, free_blocks);
- try_merge_blocks(block, block->next, free_blocks);
-
block->allocated = false;
- free_blocks.insert(block);
+ if (!block->stream_uses.empty()) {
+ return insert_events(block);
+ }
+
+ free_block(block);
return cudaSuccess;
}
@@ -226,10 +246,37 @@ struct THCCachingAllocator
cacheInfoAux(small_blocks, dev_id, total, largest);
}
+ void recordStream(void* ptr, THCStream* stream)
+ {
+ std::lock_guard<std::mutex> lock(mutex);
+ Block* block = find_allocated_block(ptr);
+ if (!block) {
+ THError("invalid device pointer: %p", ptr);
+ }
+ if (stream->stream == block->stream) {
+ // ignore uses on the allocation stream, since those don't require any
+ // special synchronization
+ return;
+ }
+ THCStream_retain(stream);
+ block->stream_uses.insert(THCStreamPtr(stream, &THCStream_free));
+ }
+
+ /** moves a block into the free block list */
+ void free_block(Block* block)
+ {
+ THAssert(!block->allocated && block->event_count == 0);
+ bool small = block->size <= kSmallAlloc;
+ auto& free_blocks = small ? large_blocks : small_blocks;
+ try_merge_blocks(block, block->prev, free_blocks);
+ try_merge_blocks(block, block->next, free_blocks);
+ free_blocks.insert(block);
+ }
+
/** combine previously split blocks */
void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks)
{
- if (!src || src->allocated) {
+ if (!src || src->allocated || src->event_count > 0) {
return;
}
if (dst->prev == src) {
@@ -303,6 +350,7 @@ struct THCCachingAllocator
cudaError_t free_blocks(FreeBlocks& blocks, FreeBlocks::iterator it, FreeBlocks::iterator end)
{
// Frees all non-split blocks between `it` and `end`
+ std::lock_guard<std::mutex> lock(cuda_free_mutex);
while (it != end) {
Block* block = *it;
if (!block->prev && !block->next) {
@@ -328,6 +376,69 @@ struct THCCachingAllocator
}
return it->second;
}
+
+ cudaError_t insert_events(Block* block)
+ {
+ cudaError_t err;
+
+ int prev_device;
+ err = cudaGetDevice(&prev_device);
+ if (err != cudaSuccess) return err;
+
+ std::set<THCStreamPtr> streams(std::move(block->stream_uses));
+ THAssert(block->stream_uses.empty());
+ for (auto it = streams.begin(); it != streams.end(); ++it) {
+ auto& stream = *it;
+
+ err = cudaSetDevice(stream->device);
+ if (err != cudaSuccess) break;
+
+ cudaEvent_t event;
+ err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+ if (err != cudaSuccess) break;
+
+ err = cudaEventRecord(event, stream->stream);
+ if (err != cudaSuccess) break;
+
+ block->event_count++;
+ cuda_events.emplace_back(event, block);
+ }
+
+ cudaSetDevice(prev_device);
+ return err;
+ }
+
+ cudaError_t process_events()
+ {
+ // Process outstanding cudaEvents. Events that are completed are removed
+ // from the queue, and the 'event_count' for the corresponding allocation
+ // is decremented. Stops at the first event which has not been completed.
+ // Since events on different devices or streams may occur out of order,
+ // the processing of some events may be delayed.
+ while (!cuda_events.empty()) {
+ auto& e = cuda_events.front();
+ cudaEvent_t event = e.first;
+ Block* block = e.second;
+
+ cudaError_t err = cudaEventQuery(event);
+ if (err == cudaErrorNotReady) {
+ break;
+ } else if (err != cudaSuccess) {
+ return err;
+ }
+ err = cudaEventDestroy(event);
+ if (err != cudaSuccess) {
+ return err;
+ }
+
+ block->event_count--;
+ if (block->event_count == 0) {
+ free_block(block);
+ }
+ cuda_events.pop_front();
+ }
+ return cudaSuccess;
+ }
};
static cudaError_t THCCachingAllocator_malloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
@@ -374,3 +485,13 @@ THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size)
{
return caching_allocator.getBaseAllocation(ptr, size);
}
+
+THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream)
+{
+ caching_allocator.recordStream(ptr, stream);
+}
+
+THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex()
+{
+ return &caching_allocator.cuda_free_mutex;
+}
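recordStream() is the hook the expanded comment block describes: when a block allocated on one stream is consumed on another, the allocator attaches an event to the consumer stream at free time and only recycles the block after that event completes (process_events() drains the queue lazily on the next malloc). A hedged sketch of the intended call pattern; THCStream_new with a non-blocking flag is assumed from the pre-existing THCStream API, and the kernel launches are elided:

    #include "THC.h"
    #include "THCStream.h"
    #include "THCCachingAllocator.h"

    static void crossStreamUse(THCState *state, size_t nbytes)
    {
      void *buf;
      THCudaCheck(THCudaMalloc(state, &buf, nbytes));          /* served by the caching allocator
                                                                   when it is enabled */
      THCStream *side = THCStream_new(cudaStreamNonBlocking);  /* assumed existing constructor */

      /* ... launch kernels on side->stream that read or write buf ... */

      THCCachingAllocator_recordStream(buf, side);   /* register the extra consumer stream */
      THCudaCheck(THCudaFree(state, buf));           /* block held until side's event has fired */
      THCStream_free(side);
    }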
diff --git a/lib/THC/THCCachingAllocator.h b/lib/THC/THCCachingAllocator.h
index 3eb3725..fbf9109 100644
--- a/lib/THC/THCCachingAllocator.h
+++ b/lib/THC/THCCachingAllocator.h
@@ -1,9 +1,19 @@
#ifndef THC_DEVICE_ALLOCATOR_INC
#define THC_DEVICE_ALLOCATOR_INC
+#if __cplusplus >= 201103L
+#include <mutex>
+#endif
+
#include "THCGeneral.h"
+#include "THCStream.h"
THC_API THCDeviceAllocator* THCCachingAllocator_get(void);
THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size);
+THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream);
+
+#if __cplusplus >= 201103L
+THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex();
+#endif
#endif
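The header also exports the new cudaFree mutex, added (per the comment in the .cpp) so that code issuing NCCL collectives can serialize against cache flushes and avoid the deadlock where one thread sits inside cudaFree while another blocks in a collective. A sketch of the intended guard, with the collective itself left as a placeholder:

    #include <mutex>
    #include "THCCachingAllocator.h"

    void launchCollective(void)
    {
      /* hold the allocator's cudaFree lock for the duration of the collective */
      std::lock_guard<std::mutex> guard(*THCCachingAllocator_getCudaFreeMutex());
      /* ncclAllReduce(...);  placeholder for any call that must not overlap a cudaFree */
    }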
diff --git a/lib/THC/THCCachingHostAllocator.cpp b/lib/THC/THCCachingHostAllocator.cpp
index 3cbbccb..a43cb30 100644
--- a/lib/THC/THCCachingHostAllocator.cpp
+++ b/lib/THC/THCCachingHostAllocator.cpp
@@ -2,6 +2,7 @@
#include <cuda_runtime_api.h>
#include <deque>
+#include <memory>
#include <mutex>
#include <set>
#include <stdint.h>
@@ -11,6 +12,8 @@
namespace {
+typedef std::shared_ptr<THCStream> THCStreamPtr;
+
struct BlockSize
{
size_t size; // allocation size
@@ -23,9 +26,10 @@ struct Block : public BlockSize
{
bool allocated; // true if the block is currently allocated
int event_count; // number of outstanding cuda events
+ std::set<THCStreamPtr> streams;
Block(size_t size, void* ptr, bool allocated) :
- BlockSize(size, ptr), allocated(allocated), event_count(0) { }
+ BlockSize(size, ptr), allocated(allocated), event_count(0), streams() {}
};
static bool BlockComparator(const BlockSize& a, const BlockSize& b)
@@ -98,13 +102,28 @@ struct HostAllocator
return cudaSuccess;
}
+ // process outstanding cuda events which may have occurred
+ cudaError_t err = processEvents();
+ if (err != cudaSuccess) {
+ return err;
+ }
+
auto it = blocks.find(ptr);
THAssert(it != blocks.end());
Block& block = it->second;
THAssert(block.allocated);
+ // free (on valid memory) shouldn't fail, so mark unallocated before
+ // we process the streams.
block.allocated = false;
+
+ // insert CUDA events for each stream on which this block was used. This
+ err = insertEvents(block);
+ if (err != cudaSuccess) {
+ return err;
+ }
+
if (block.event_count == 0) {
// the block can be re-used if there are no outstanding cuda events
available.insert(block);
@@ -112,7 +131,7 @@ struct HostAllocator
return cudaSuccess;
}
- cudaError_t recordEvent(void* ptr, cudaStream_t stream)
+ cudaError_t recordEvent(void* ptr, THCStream *stream)
{
std::lock_guard<std::mutex> lock(mutex);
cudaError_t err;
@@ -126,26 +145,10 @@ struct HostAllocator
Block& block = it->second;
THAssert(block.allocated);
- // process outstanding cuda events which may have occurred
- err = processEvents();
- if (err != cudaSuccess) {
- return err;
- }
+ THCStreamPtr stream_ptr(stream, &THCStream_free);
+ THCStream_retain(stream);
- // create and record an event in the given stream
- cudaEvent_t event;
- err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
- if (err != cudaSuccess) {
- return err;
- }
- err = cudaEventRecord(event, stream);
- if (err != cudaSuccess) {
- return err;
- }
-
- // the block will not be re-used until all associated events have occured
- block.event_count++;
- cuda_events.emplace_back(event, ptr);
+ block.streams.insert(std::move(stream_ptr));
return cudaSuccess;
}
@@ -186,18 +189,17 @@ struct HostAllocator
std::lock_guard<std::mutex> lock(mutex);
// remove events for freed blocks
- std::deque<std::pair<cudaEvent_t, void*>> new_events;
for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) {
cudaEvent_t event = it->first;
Block& block = blocks.at(it->second);
if (!block.allocated) {
THCudaCheckWarn(cudaEventDestroy(event));
block.event_count--;
- } else {
- new_events.push_back(*it);
}
}
- cuda_events.swap(new_events);
+
+ // all cuda_events have been processed
+ cuda_events.clear();
// clear list of available blocks
available.clear();
@@ -213,6 +215,36 @@ struct HostAllocator
}
}
}
+
+ cudaError_t insertEvents(Block& block)
+ {
+ cudaError_t err;
+
+ int prev_device;
+ err = cudaGetDevice(&prev_device);
+ if (err != cudaSuccess) return err;
+
+ std::set<THCStreamPtr> streams(std::move(block.streams));
+ for (auto it = streams.begin(); it != streams.end(); ++it) {
+ auto& stream = *it;
+
+ err = cudaSetDevice(stream->device);
+ if (err != cudaSuccess) break;
+
+ cudaEvent_t event;
+ err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+ if (err != cudaSuccess) break;
+
+ err = cudaEventRecord(event, stream->stream);
+ if (err != cudaSuccess) break;
+
+ block.event_count++;
+ cuda_events.emplace_back(event, block.ptr);
+ }
+
+ cudaSetDevice(prev_device);
+ return err;
+ }
};
} // namespace
@@ -232,7 +264,7 @@ static void THCCachingHostAllocator_free(void* ctx, void* ptr)
allocator.free(ptr);
}
-cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream)
+cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream)
{
return allocator.recordEvent(ptr, stream);
}
diff --git a/lib/THC/THCCachingHostAllocator.h b/lib/THC/THCCachingHostAllocator.h
index a695565..05513ac 100644
--- a/lib/THC/THCCachingHostAllocator.h
+++ b/lib/THC/THCCachingHostAllocator.h
@@ -2,6 +2,7 @@
#define THC_CACHING_HOST_ALLOCATOR_INC
#include "THCGeneral.h"
+#include "THCStream.h"
//
// A caching allocator for CUDA host allocations (pinned memory).
@@ -22,7 +23,7 @@ THC_API THAllocator THCCachingHostAllocator;
// Records an event in the specified stream. The allocation 'ptr' will not be
// re-used until the event has occured.
-THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream);
+THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream);
// Releases cached pinned memory allocations via cudaHostFree
THC_API void THCCachingHostAllocator_emptyCache(void);
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
index c442bd8..e99487e 100644
--- a/lib/THC/THCGeneral.c
+++ b/lib/THC/THCGeneral.c
@@ -75,6 +75,7 @@ void THCudaInit(THCState* state)
state->currentStreams[i] = THCThreadLocal_alloc();
}
state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();
+ state->currentPerDeviceSparseHandle = THCThreadLocal_alloc();
state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
@@ -107,9 +108,9 @@ void THCudaInit(THCState* state)
THCudaCheck(cudaSetDevice(i));
THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));
- // Allocate space for the NULL stream
+ // Allocate space for the default stream
res->streams = (THCStream**) malloc(sizeof(THCStream*));
- res->streams[0] = NULL;
+ res->streams[0] = THCStream_defaultStream(i);
/* The scratch space that we want to have available per each device is
based on the number of SMs available per device. We guarantee a
@@ -131,6 +132,7 @@ void THCudaInit(THCState* state)
// cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
// handles are created lazily.
state->numUserBlasHandles = 1;
+ state->numUserSparseHandles = 1;
state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
state->heapDelta = 0;
@@ -158,14 +160,18 @@ void THCudaShutdown(THCState* state)
for (int dev = 0; dev < deviceCount; ++dev) {
THCudaCheck(cudaSetDevice(dev));
THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]);
- /* Free user reserved streams (0 is the default stream) */
- for (int i = 1; i <= state->numUserStreams; ++i) {
+ /* Free all streams */
+ for (int i = 0; i <= state->numUserStreams; ++i) {
THCStream_free(res->streams[i]);
}
/* Free user defined BLAS handles */
for (int i = 0; i < res->numBlasHandles; ++i) {
THCublasCheck(cublasDestroy(res->blasHandles[i]));
}
+ /* Free user defined sparse handles */
+ for (int i = 0; i < res->numSparseHandles; ++i) {
+ THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
+ }
/* Free per-stream scratch space; starts at 0 because there is space for
the default stream as well*/
if (res->devScratchSpacePerStream) {
@@ -176,6 +182,7 @@ void THCudaShutdown(THCState* state)
free(res->streams);
free(res->blasHandles);
+ free(res->sparseHandles);
free(res->devScratchSpacePerStream);
THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
THCThreadLocal_free(state->currentStreams[dev]);
@@ -293,11 +300,20 @@ THAllocator* THCState_getCudaUVAAllocator(THCState* state)
return state->cudaUVAAllocator;
}
+THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state)
+{
+ return state->cudaDeviceAllocator;
+}
+
void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator)
{
state->cudaDeviceAllocator = allocator;
}
+int THCState_isCachingAllocatorEnabled(THCState* state) {
+ return state->cudaHostAllocator == &THCCachingHostAllocator;
+}
+
int THCState_getNumDevices(THCState *state)
{
return state->numDevices;
@@ -383,6 +399,29 @@ void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasH
THCudaCheck(cudaSetDevice(prevDev));
}
+void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSparseHandles)
+{
+ int prevDev = -1;
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ if (numSparseHandles <= res->numSparseHandles) {
+ return;
+ }
+
+ THCudaCheck(cudaGetDevice(&prevDev));
+ THCudaCheck(cudaSetDevice(device));
+
+ size_t size = numSparseHandles * sizeof(cusparseHandle_t);
+ cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
+ for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
+ handles[i] = NULL;
+ THCusparseCheck(cusparseCreate(&handles[i]));
+ }
+ res->sparseHandles = handles;
+ res->numSparseHandles = numSparseHandles;
+
+ THCudaCheck(cudaSetDevice(prevDev));
+}
+
void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
{
// cuBLAS handles are created lazily from THCState_getDeviceBlasHandle
@@ -393,6 +432,16 @@ void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
}
}
+void THCState_reserveSparseHandles(THCState* state, int numSparseHandles)
+{
+ // cuBLAS handles are created lazily from THCState_getDeviceSparseHandle
+ // to avoid initializing unused devices
+ if (numSparseHandles > state->numUserSparseHandles)
+ {
+ state->numUserSparseHandles = numSparseHandles;
+ }
+}
+
int THCState_getNumStreams(THCState* state)
{
return state->numUserStreams;
@@ -403,6 +452,11 @@ int THCState_getNumBlasHandles(THCState* state)
return state->numUserBlasHandles;
}
+int THCState_getNumSparseHandles(THCState* state)
+{
+ return state->numUserSparseHandles;
+}
+
THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device)
{
@@ -423,7 +477,7 @@ cudaStream_t THCState_getDeviceStream(THCState *state, int device, int streamInd
}
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
THCStream* stream = res->streams[streamIndex];
- return stream ? stream->stream : NULL;
+ return stream->stream;
}
cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle)
@@ -437,20 +491,37 @@ cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int han
return res->blasHandles[handle - 1];
}
+cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
+{
+ if (handle <= 0 || handle > state->numUserSparseHandles) {
+ THError("%d is not a valid handle, valid range is: (1, %d)",
+ handle, state->numUserSparseHandles);
+ }
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ THCState_reserveDeviceSparseHandles(state, device, handle);
+ return res->sparseHandles[handle - 1];
+}
+
static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
{
- return (THCStream*) THCThreadLocal_get(state->currentStreams[device]);
+ THCThreadLocal local = state->currentStreams[device];
+ THCStream* stream = (THCStream*)THCThreadLocal_get(local);
+ if (!stream) {
+ stream = THCStream_defaultStream(device);
+ THCStream_retain(stream);
+ THCThreadLocal_set(local, stream);
+ }
+ return stream;
}
static void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream)
{
- if (stream) {
- if (stream->device != device) {
- THError("invalid stream; expected stream for device %d, but was on %d",
- device, stream->device);
- }
- THCStream_retain(stream);
+ THAssert(stream);
+ if (stream->device != device) {
+ THError("invalid stream; expected stream for device %d, but was on %d",
+ device, stream->device);
}
+ THCStream_retain(stream);
THCThreadLocal local = state->currentStreams[device];
THCStream_free((THCStream*)THCThreadLocal_get(local));
THCThreadLocal_set(local, stream);
@@ -459,7 +530,8 @@ static void THCState_setStreamOnDevice(THCState *state, int device, THCStream *s
cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device)
{
THCStream* stream = THCState_getStreamOnDevice(state, device);
- return stream ? stream->stream : NULL;
+ THAssert(stream);
+ return stream->stream;
}
cudaStream_t THCState_getCurrentStream(THCState *state)
@@ -493,12 +565,25 @@ cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
return NULL;
}
+cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
+{
+ /* This is called at the point of kernel execution.
+ For some debugging code or improperly instrumented kernels,
+ `state` is null */
+ if (state) {
+ int device;
+ THCudaCheck(cudaGetDevice(&device));
+
+ int handle = THCState_getCurrentSparseHandleIndex(state);
+ return THCState_getDeviceSparseHandle(state, device, handle);
+ }
+ THError("THCState and sparseHandles must be set as there is no default sparseHandle");
+ return NULL;
+}
+
int THCState_getCurrentStreamIndex(THCState *state)
{
THCStream* stream = THCState_getStream(state);
- if (!stream) {
- return 0;
- }
int device;
THCudaCheck(cudaGetDevice(&device));
@@ -521,6 +606,15 @@ int THCState_getCurrentBlasHandleIndex(THCState *state)
return (int) (intptr_t) value;
}
+int THCState_getCurrentSparseHandleIndex(THCState *state)
+{
+ void* value = THCThreadLocal_get(state->currentPerDeviceSparseHandle);
+ if (value == NULL) {
+ return 1;
+ }
+ return (int) (intptr_t) value;
+}
+
THCStream* THCState_getStream(THCState *state)
{
int device;
@@ -544,13 +638,8 @@ void THCState_setCurrentStreamIndex(THCState *state, int streamIndex)
int device;
for (device = 0; device < state->numDevices; ++device) {
- THCStream* stream = NULL;
- if (streamIndex != 0) {
- THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
- stream = res->streams[streamIndex];
- }
-
- THCState_setStreamOnDevice(state, device, stream);
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ THCState_setStreamOnDevice(state, device, res->streams[streamIndex]);
}
}
@@ -564,6 +653,16 @@ void THCState_setCurrentBlasHandleIndex(THCState *state, int handle)
THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle);
}
+void THCState_setCurrentSparseHandleIndex(THCState *state, int handle)
+{
+ if (handle > state->numUserSparseHandles || handle <= 0)
+ {
+ THError("%d is not a valid handle, valid range is: (1, %d)",
+ handle, state->numUserSparseHandles);
+ }
+ THCThreadLocal_set(state->currentPerDeviceSparseHandle, (void*)(intptr_t)handle);
+}
+
void* THCState_getCurrentDeviceScratchSpace(THCState* state)
{
int device = -1;
@@ -668,6 +767,55 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
}
}
+void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
+{
+ if(status != CUSPARSE_STATUS_SUCCESS)
+ {
+ const char* errmsg = NULL;
+
+ switch(status)
+ {
+ case CUSPARSE_STATUS_NOT_INITIALIZED:
+ errmsg = "library not initialized";
+ break;
+
+ case CUSPARSE_STATUS_ALLOC_FAILED:
+ errmsg = "resource allocation failed";
+ break;
+
+ case CUSPARSE_STATUS_INVALID_VALUE:
+ errmsg = "an invalid numeric value was used as an argument";
+ break;
+
+ case CUSPARSE_STATUS_ARCH_MISMATCH:
+ errmsg = "an absent device architectural feature is required";
+ break;
+
+ case CUSPARSE_STATUS_MAPPING_ERROR:
+ errmsg = "an access to GPU memory space failed";
+ break;
+
+ case CUSPARSE_STATUS_EXECUTION_FAILED:
+ errmsg = "the GPU program failed to execute";
+ break;
+
+ case CUSPARSE_STATUS_INTERNAL_ERROR:
+ errmsg = "an internal operation failed";
+ break;
+
+ case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+ errmsg = "the matrix type is not supported by this function";
+ break;
+
+ default:
+ errmsg = "unknown error";
+ break;
+ }
+
+ _THError(file, line, "cusparse runtime error : %s", errmsg);
+ }
+}
+
static ptrdiff_t heapSize = 0; // not thread-local
static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
@@ -700,6 +848,27 @@ cudaError_t THCudaFree(THCState *state, void *ptr)
return allocator->free(allocator->state, ptr);
}
+void* THCudaHostAlloc(THCState *state, size_t size)
+{
+ THCudaCheck(cudaGetLastError());
+ THAllocator* allocator = state->cudaHostAllocator;
+ return allocator->malloc(NULL, size);
+}
+
+void THCudaHostFree(THCState *state, void *ptr)
+{
+ THAllocator* allocator = state->cudaHostAllocator;
+ return allocator->free(NULL, ptr);
+}
+
+void THCudaHostRecord(THCState *state, void *ptr)
+{
+ if (state->cudaHostAllocator == &THCCachingHostAllocator) {
+ THCStream* stream = THCState_getStream(state);
+ THCCachingHostAllocator_recordEvent(ptr, stream);
+ }
+}
+
cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes)
{
size_t cachedBytes = 0;
@@ -768,3 +937,19 @@ void THCHeapUpdate(THCState *state, ptrdiff_t size) {
#include "THCStorage.c"
#include "THCAllocator.c"
+
+/* from THCHalf.h */
+
+half THC_float2half(float f)
+{
+ half h;
+ TH_float2halfbits(&f, &h.x);
+ return h;
+}
+
+float THC_half2float(half h)
+{
+ float f;
+ TH_halfbits2float(&h.x, &f);
+ return f;
+}
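The cuSPARSE plumbing mirrors the existing cuBLAS handle management: handles are per-device, created lazily, selected through a thread-local index, and THCusparseCheck maps cusparseStatus_t codes onto readable THErrors. The THC_float2half/THC_half2float definitions appended at the end replace the hand-rolled conversions removed from THCHalf.cu below, delegating to TH's shared bit-level helpers. A minimal sketch of fetching the current sparse handle and binding it to the current stream before a cuSPARSE call:

    /* sketch: use the lazily created per-device cuSPARSE handle on the current stream */
    #include "THC.h"

    static void useSparseHandle(THCState *state)
    {
      cusparseHandle_t handle = THCState_getCurrentSparseHandle(state);
      THCusparseCheck(cusparseSetStream(handle, THCState_getCurrentStream(state)));
      /* ... a cusparseScsrmv(handle, ...) or similar call would follow here ... */
    }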
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
index a88bd7d..f33446d 100644
--- a/lib/THC/THCGeneral.h.in
+++ b/lib/THC/THCGeneral.h.in
@@ -9,6 +9,7 @@
#include "cuda.h"
#include "cuda_runtime.h"
#include "cublas_v2.h"
+#include "cusparse.h"
#cmakedefine USE_MAGMA
@@ -57,8 +58,12 @@ typedef struct _THCCudaResourcesPerDevice {
THCStream** streams;
/* Number of materialized cuBLAS handles */
int numBlasHandles;
+ /* Number of materialized cuSparse handles */
+ int numSparseHandles;
/* cuBLAS handes are lazily initialized */
cublasHandle_t* blasHandles;
+ /* cuSparse handes are lazily initialized */
+ cusparseHandle_t* sparseHandles;
/* Size of scratch space per each stream on this device available */
size_t scratchSpacePerStream;
/* Device-resident scratch space per stream, used for global memory
@@ -72,9 +77,9 @@ struct THCState {
struct THCRNGState* rngState;
struct cudaDeviceProp* deviceProperties;
/* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL,
- which specifies the per-device default stream. blasHandles do not have a
- default and must be explicitly initialized. We always initialize 1
- blasHandle but we can use more.
+ which specifies the per-device default stream. blasHandles and
+ sparseHandles do not have a default and must be explicitly initialized.
+ We always initialize 1 blasHandle and 1 sparseHandle but we can use more.
*/
THCCudaResourcesPerDevice* resourcesPerDevice;
/* Captured number of devices upon startup; convenience for bounds checking */
@@ -82,6 +87,7 @@ struct THCState {
/* Number of Torch defined resources available, indices 1 ... numStreams */
int numUserStreams;
int numUserBlasHandles;
+ int numUserSparseHandles;
/* Allocator using cudaMallocHost. */
THAllocator* cudaHostAllocator;
@@ -91,6 +97,9 @@ struct THCState {
/* Index of the current selected BLAS handle. The actual BLAS handle used
depends on the current device. */
THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
+ /* Index of the current selected sparse handle. The actual sparse handle used
+ depends on the current device. */
+ THCThreadLocal/*<int>*/ currentPerDeviceSparseHandle;
/* Array of thread locals containing the current stream for each device */
THCThreadLocal* currentStreams;
@@ -139,7 +148,9 @@ THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* sta
THC_API struct THCRNGState* THCState_getRngState(THCState* state);
THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state);
THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state);
+THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state);
THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator);
+THC_API int THCState_isCachingAllocatorEnabled(THCState* state);
THC_API void THCMagma_init(THCState *state);
@@ -161,27 +172,42 @@ THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream);
THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles);
THC_API int THCState_getNumBlasHandles(THCState* state);
+THC_API void THCState_reserveSparseHandles(THCState* state, int numHandles);
+THC_API int THCState_getNumSparseHandles(THCState* state);
+
THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle);
THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);
+THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle);
+THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state);
+THC_API int THCState_getCurrentSparseHandleIndex(THCState *state);
+THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle);
+
/* For the current device and stream, returns the allocated scratch space */
THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state);
THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream);
THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);
+#define THCAssertSameGPU(expr) if (!expr) THError("arguments are located on different GPUs")
#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)
#define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__)
#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__)
+#define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__)
THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
+THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
+THC_API void* THCudaHostAlloc(THCState *state, size_t size);
+THC_API void THCudaHostFree(THCState *state, void *ptr);
+THC_API void THCudaHostRecord(THCState *state, void *ptr);
+
THC_API cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes);
THC_API void THCSetGCHandler(THCState *state,
void (*torchGCHandlerFunction)(void *data),
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 023774e..7863260 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -33,96 +33,6 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
in, in + len, out, __half2floatOp());
}
-// FixMe: could call TH_half2float
-// and convert types here, but maybe slower?
-float THC_half2float(half h)
-{
- unsigned sign = ((h.x >> 15) & 1);
- unsigned exponent = ((h.x >> 10) & 0x1f);
- unsigned mantissa = ((h.x & 0x3ff) << 13);
-
- if (exponent == 0x1f) { /* NaN or Inf */
- mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
- exponent = 0xff;
- } else if (!exponent) { /* Denorm or Zero */
- if (mantissa) {
- unsigned int msb;
- exponent = 0x71;
- do {
- msb = (mantissa & 0x400000);
- mantissa <<= 1; /* normalize */
- --exponent;
- } while (!msb);
- mantissa &= 0x7fffff; /* 1.mantissa is implicit */
- }
- } else {
- exponent += 0x70;
- }
-
- int temp = ((sign << 31) | (exponent << 23) | mantissa);
-
- float x;
- memcpy(&x,&temp,sizeof(float));
- return x;
-}
-
-half THC_float2half(float f)
-{
- half ret;
-
- unsigned x;
- memcpy(&x,&f,sizeof(f));
- unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
- unsigned sign, exponent, mantissa;
-
- // Get rid of +NaN/-NaN case first.
- if (u > 0x7f800000) {
- ret.x = 0x7fffU;
- return ret;
- }
-
- sign = ((x >> 16) & 0x8000);
-
- // Get rid of +Inf/-Inf, +0/-0.
- if (u > 0x477fefff) {
- ret.x = sign | 0x7c00U;
- return ret;
- }
- if (u < 0x33000001) {
- ret.x = (sign | 0x0000);
- return ret;
- }
-
- exponent = ((u >> 23) & 0xff);
- mantissa = (u & 0x7fffff);
-
- if (exponent > 0x70) {
- shift = 13;
- exponent -= 0x70;
- } else {
- shift = 0x7e - exponent;
- exponent = 0;
- mantissa |= 0x800000;
- }
- lsb = (1 << shift);
- lsb_s1 = (lsb >> 1);
- lsb_m1 = (lsb - 1);
-
- // Round to nearest even.
- remainder = (mantissa & lsb_m1);
- mantissa >>= shift;
- if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
- ++mantissa;
- if (!(mantissa & 0x3ff)) {
- ++exponent;
- mantissa = 0;
- }
- }
-
- ret.x = (sign | (exponent << 10) | mantissa);
- return ret;
-}
-
THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
cudaDeviceProp* prop =
THCState_getCurrentDeviceProperties(state);
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index 0944360..b6d1dac 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -48,7 +48,7 @@ struct THCNumerics<char> {
static inline __host__ __device__ char mul(char a, char b) { return a * b; }
static inline __host__ __device__ char sub(char a, char b) { return a - b; }
static inline __host__ __device__ char div(char a, char b) { return a / b; }
- static inline __host__ __device__ char abs(char a) { return abs(a); }
+ static inline __host__ __device__ char abs(char a) { return ::abs((int)a); }
};
template <>
@@ -67,7 +67,7 @@ struct THCNumerics<short> {
static inline __host__ __device__ short mul(short a, short b) { return a * b; }
static inline __host__ __device__ short sub(short a, short b) { return a - b; }
static inline __host__ __device__ short div(short a, short b) { return a / b; }
- static inline __host__ __device__ short abs(short a) { return abs(a); }
+ static inline __host__ __device__ short abs(short a) { return ::abs((int)a); }
};
template <>
@@ -211,6 +211,19 @@ struct THCNumerics<half> {
#endif
}
+ static inline __host__ __device__ half exp10(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return hexp10(a);
+#else
+ float fa = __half2float(a);
+ return __float2half(exp10f(fa));
+#endif
+#else // __CUDA_ARCH__
+ return THC_float2half(exp10f(THC_half2float(a)));
+#endif
+ }
+
static inline __host__ __device__ half log(half a) {
#ifdef __CUDA_ARCH__
#ifdef CUDA_HALF_INSTRUCTIONS
@@ -233,6 +246,15 @@ struct THCNumerics<half> {
#endif
}
+static inline __host__ __device__ half lgamma(half a) {
+#ifdef __CUDA_ARCH__
+ float fa = __half2float(a);
+ return __float2half(lgammaf(fa));
+#else // __CUDA_ARCH__
+ return THC_float2half(lgammaf(THC_half2float(a)));
+#endif
+ }
+
static inline __host__ __device__ half cos(half a) {
#ifdef __CUDA_ARCH__
#ifdef CUDA_HALF_INSTRUCTIONS
@@ -514,7 +536,9 @@ struct THCNumerics<float> {
static inline __host__ __device__ bool eq(float a, float b) { return a == b; }
static inline __host__ __device__ bool ne(float a, float b) { return a != b; }
+ static inline __host__ __device__ float lgamma(float a) { return lgammaf(a);}
static inline __host__ __device__ float exp (float a) { return expf(a); }
+ static inline __host__ __device__ float exp10(float a) { return exp10f(a); }
static inline __host__ __device__ float log (float a) { return logf(a); }
static inline __host__ __device__ float log1p(float a) { return log1pf(a); }
static inline __host__ __device__ float cos (float a) { return cosf(a); }
@@ -557,7 +581,9 @@ struct THCNumerics<double> {
static inline __host__ __device__ bool eq(double a, double b) { return a == b; }
static inline __host__ __device__ bool ne(double a, double b) { return a != b; }
+ static inline __host__ __device__ double lgamma(double a) { return ::lgamma(a);}
static inline __host__ __device__ double exp (double a) { return ::exp(a); }
+ static inline __host__ __device__ double exp10(double a) { return ::exp10(a); }
static inline __host__ __device__ double log (double a) { return ::log(a); }
static inline __host__ __device__ double log1p(double a) { return ::log1p(a); }
static inline __host__ __device__ double cos (double a) { return ::cos(a); }
diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh
index 7f276a2..067d796 100644
--- a/lib/THC/THCReduce.cuh
+++ b/lib/THC/THCReduce.cuh
@@ -168,7 +168,8 @@ bool THC_reduceDim(THCState* state,
const ModifyOp& modifyOp,
const ReduceOp& reduceOp,
typename TensorUtils<TensorType>::DataType init,
- int dim) {
+ int dim,
+ int keepdim) {
ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
@@ -315,6 +316,10 @@ bool THC_reduceDim(THCState* state,
#undef HANDLE_IN_CASE
#undef HANDLE_OUT_CASE
+
+ if (!keepdim) {
+ TensorUtils<TensorType>::squeeze1d(state, out, out, dim);
+ }
return true;
}
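
For reference, a minimal host-side C++ sketch (illustrative only, not part of the patch) of what the new keepdim flag means for the result shape: the reduced dimension is either kept with size 1 or squeezed away, matching the squeeze1d call above. The helper name reducedShape is hypothetical.

    #include <cstdio>
    #include <vector>

    // Shape of the output of a reduction over `dim`, with and without keepdim.
    std::vector<long> reducedShape(std::vector<long> shape, int dim, bool keepdim) {
      if (keepdim) shape[dim] = 1;                    // keep the dimension as size 1
      else shape.erase(shape.begin() + dim);          // squeeze it out
      return shape;
    }

    int main() {
      for (long s : reducedShape({4, 3, 2}, 1, true))  std::printf("%ld ", s);  // 4 1 2
      std::printf("\n");
      for (long s : reducedShape({4, 3, 2}, 1, false)) std::printf("%ld ", s);  // 4 2
      std::printf("\n");
      return 0;
    }
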
diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh
index 9a335c7..1d04e63 100644
--- a/lib/THC/THCReduceAll.cuh
+++ b/lib/THC/THCReduceAll.cuh
@@ -331,7 +331,7 @@ bool THC_reduceAll(THCState* state,
// If our destination is not on the device, copy the value back to
// the host (synchronous!)
if (!outOnDevice) {
- cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost);
+ THCudaCheck(cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost));
}
if (freeDevOut) {
diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh
index e365b3a..30325de 100644
--- a/lib/THC/THCReduceApplyUtils.cuh
+++ b/lib/THC/THCReduceApplyUtils.cuh
@@ -19,57 +19,113 @@ __device__ __forceinline__ IndexType getLinearBlockId() {
blockIdx.x;
}
-// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
-// return the reduced value
-template <typename T, typename ReduceOp>
-__device__ T reduceBlock(T* smem,
- int numVals,
- T threadVal,
- ReduceOp reduceOp,
- T init) {
+// Reduce N values concurrently, i.e. suppose N = 2 and there are 4 threads holding
+// (1, 2), (3, 4), (5, 6), (7, 8); then the result in threadVals for thread 0
+// is (1 + 3 + 5 + 7, 2 + 4 + 6 + 8) = (16, 20)
+template <typename T, typename ReduceOp, int N>
+__device__ void reduceNValuesInBlock(T *smem,
+ T threadVals[N],
+ int numVals,
+ ReduceOp reduceOp,
+ T init) {
if (numVals == 0) {
- return init;
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ threadVals[i] = init;
+ }
+ return;
}
+ // We store each of the N values contiguously, so if N = 2, all values for
+ // the first threadVal for each thread in the block are stored followed by
+ // all of the values for the second threadVal for each thread in the block
if (threadIdx.x < numVals) {
- smem[threadIdx.x] = threadVal;
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ smem[i * numVals + threadIdx.x] = threadVals[i];
+ }
}
-
- // First warp will perform reductions across warps
__syncthreads();
- if ((threadIdx.x / warpSize) == 0) {
- T r = threadIdx.x < numVals ? smem[threadIdx.x] : init;
+
+ // Number of lanes in the final reduction --> this is used to determine
+ // where to put the outputs of each of the n things we are reducing. If
+ // nLP = 32, then we have the 32 outputs for the first threadVal,
+ // followed by the 32 outputs for the second threadVal, etc.
+ int numLanesParticipating = min(numVals, warpSize);
+
+ if (numVals > warpSize && ((threadIdx.x / warpSize) == 0 )) {
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ threadVals[i] = threadIdx.x < numVals ? threadVals[i] : init;
+ }
for (int i = warpSize + threadIdx.x; i < numVals; i += warpSize) {
- r = reduceOp(r, smem[i]);
+#pragma unroll
+ for (int j = 0; j < N; ++j) {
+ threadVals[j] = reduceOp(threadVals[j], smem[j * numVals + i]);
+ }
}
- smem[threadIdx.x] = r;
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ smem[i * numLanesParticipating + threadIdx.x] = threadVals[i];
+ }
}
-
- // First thread will perform reductions across the block
__syncthreads();
- T r = init;
if (threadIdx.x == 0) {
- r = smem[0];
-
- int numLanesParticipating = min(numVals, warpSize);
-
if (numLanesParticipating == 32) {
- // Unroll for warpSize == 32 and numVals >= 32
#pragma unroll
- for (int i = 1; i < 32; ++i) {
- r = reduceOp(r, smem[i]);
+ for (int i = 0; i < N; ++i) {
+#pragma unroll
+ for (int j = 1; j < 32; ++j) {
+ threadVals[i] = reduceOp(threadVals[i], smem[i * 32 + j]);
+ }
}
} else {
- for (int i = 1; i < numLanesParticipating; ++i) {
- r = reduceOp(r, smem[i]);
+#pragma unroll
+ for (int i = 0; i < N; ++i) {
+ for (int j = 1; j < numLanesParticipating; ++j) {
+ threadVals[i] = reduceOp(threadVals[i], smem[i * numVals + j]);
+ }
}
}
}
+}
+
+// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
+// return the reduced value
+template <typename T, typename ReduceOp>
+__device__ T reduceBlock(T* smem,
+ int numVals,
+ T threadVal,
+ ReduceOp reduceOp,
+ T init) {
+ reduceNValuesInBlock<T, ReduceOp, 1>(smem, &threadVal, numVals, reduceOp, init);
+ return threadVal;
+}
+
+
+// Block-wide reduction where each thread locally reduces N
+// values before letting a single warp take over - assumes
+// threadVals is in registers, not shared memory
+template <typename T, typename ReduceOp, int N>
+__device__ T reduceBlockWithNThreadLocalReductions(T *smem,
+ T threadVals[N],
+ int numVals,
+ ReduceOp reduceOp,
+ T init) {
+ int offset = threadIdx.x * N;
+ T local = offset < numVals ? threadVals[0] : init;
+
+#pragma unroll
+ for (int i = 1; i < N; ++i) {
+ ++offset;
+ T next = offset < numVals ? threadVals[i] : init;
+ local = reduceOp(local, next);
+ }
- return r;
+ return reduceBlock<T, ReduceOp>(smem, blockDim.x < numVals ? blockDim.x : numVals, local, reduceOp, init);
}
// Make sure the given tensor doesn't have too many dimensions
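
As a plain C++ reference for the N-way block reduction above (a sketch under the same per-thread-values convention, not part of the patch): entry i of the result is the reduction, across all threads, of the i-th value each thread holds. The helper name reduceNValuesReference is illustrative.

    #include <cstdio>
    #include <vector>

    template <typename T, typename ReduceOp>
    std::vector<T> reduceNValuesReference(const std::vector<std::vector<T>>& vals,
                                          ReduceOp reduceOp, T init) {
      if (vals.empty()) return {};
      std::vector<T> out(vals[0].size(), init);
      for (const auto& threadVals : vals)          // one entry per "thread"
        for (size_t i = 0; i < out.size(); ++i)    // N values per thread
          out[i] = reduceOp(out[i], threadVals[i]);
      return out;
    }

    int main() {
      // The example from the comment: N = 2, four threads holding
      // (1, 2), (3, 4), (5, 6), (7, 8) -> (16, 20).
      std::vector<std::vector<int>> vals = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
      auto r = reduceNValuesReference(vals, [](int a, int b) { return a + b; }, 0);
      std::printf("(%d, %d)\n", r[0], r[1]);       // prints (16, 20)
      return 0;
    }
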
diff --git a/lib/THC/THCScanUtils.cuh b/lib/THC/THCScanUtils.cuh
index 41a4423..ccf27b7 100644
--- a/lib/THC/THCScanUtils.cuh
+++ b/lib/THC/THCScanUtils.cuh
@@ -5,9 +5,103 @@
// Collection of in-kernel scan / prefix sum utilities
+// Inclusive Scan via an upsweep/downsweep mechanism. Assumes:
+//
+// 1. Power2ScanSize is a power of 2. This code still works for collections that
+// do not contain exactly a power-of-2 number of elements: simply round up to the
+// nearest power of 2 and then call.
+//
+// 2. There are two elements per thread, i.e. the size of the smem storage
+// is 2 * blockDim.x * sizeof(T).
+//
+// Consider a (+)-Scan on the following elements:
+//
+// Upsweep:
+//
+//    0   1   2   3   4   5   6   7
+//        1       5       9      13
+//                6              22
+//                               28
+//
+// Downsweep:
+//                       15
+//            3      10      21
+template <typename T, class BinaryOp, int Power2ScanSize>
+__device__ void inclusivePrefixScan(T *smem, BinaryOp binop) {
+ // Reduce step ("upsweep")
+#pragma unroll
+ for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if (index < Power2ScanSize) {
+ smem[index] = binop(smem[index], smem[index - stride]);
+ }
+ __syncthreads();
+ }
+
+ // Post-reduce step ("downsweep")
+#pragma unroll
+ for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if ((index + stride) < Power2ScanSize) {
+ smem[index + stride] = binop(smem[index + stride], smem[index]);
+ }
+ __syncthreads();
+ }
+}
+
+// Generic Op that can be used to support segmented scans by re-using
+// the basic inclusiveScanOp. Merely requires that the input data has both
+// a flag and val component
+template <typename T, class BinaryOp>
+struct SegmentedScanOp {
+ __host__ __device__ SegmentedScanOp(BinaryOp binop): _binop(binop) {}
+ __host__ __device__ inline T operator()(const T& a, const T& b) {
+ T c;
+ c.val = a.flag ? a.val : _binop(a.val, b.val);
+ c.flag = a.flag | b.flag;
+ return c;
+ }
+
+ BinaryOp _binop;
+};
+
+// Extends the above Inclusive Scan to support segments. It has the same properties
+// but also takes a flag array that indicates the starts of "segments", i.e. individual
+// units to scan. For example, consider the following (+)-scan that is segmented:
+//
+// Input: [1, 3, 2, 4, 1, 2, 3, 2, 1, 4]
+// Flags: [1, 0, 0, 1, 0, 1, 1, 0, 1, 0]
+// Output: [1, 4, 6, 4, 5, 2, 3, 5, 1, 5]
+//
+// So we see that each "flag" resets the scan to that index.
+template <typename T, class BinaryOp, int Power2ScanSize>
+__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) {
+ // Reduce step ("upsweep")
+#pragma unroll
+ for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if (index < Power2ScanSize) {
+ smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]);
+ bmem[index] = bmem[index] | bmem[index - stride];
+ }
+ __syncthreads();
+ }
+
+ // Post-reduce step ("downsweep")
+#pragma unroll
+ for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
+ int index = (threadIdx.x + 1) * stride * 2 - 1;
+ if ((index + stride) < Power2ScanSize) {
+ smem[index + stride] = bmem[index + stride] ? smem[index + stride] : binop(smem[index + stride], smem[index]);
+ bmem[index + stride] = bmem[index + stride] | bmem[index];
+ }
+ __syncthreads();
+ }
+}
+
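
A serial C++ reference for the segmented inclusive scan documented above (a sketch, not part of the patch): a set flag restarts the running reduction, reproducing the Input/Flags/Output example. The helper name segmentedInclusiveScanRef is illustrative.

    #include <cstdio>
    #include <vector>

    template <typename T, typename BinaryOp>
    std::vector<T> segmentedInclusiveScanRef(const std::vector<T>& in,
                                             const std::vector<bool>& flags,
                                             BinaryOp binop) {
      std::vector<T> out(in.size());
      for (size_t i = 0; i < in.size(); ++i)
        out[i] = (i == 0 || flags[i]) ? in[i] : binop(out[i - 1], in[i]);
      return out;
    }

    int main() {
      std::vector<int>  in   = {1, 3, 2, 4, 1, 2, 3, 2, 1, 4};
      std::vector<bool> flag = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
      for (int v : segmentedInclusiveScanRef(in, flag, [](int a, int b) { return a + b; }))
        std::printf("%d ", v);                 // 1 4 6 4 5 2 3 5 1 5
      std::printf("\n");
      return 0;
    }
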
// Inclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency>
-__device__ void inclusivePrefixSum(T* smem, T in, T* out) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) {
// FIXME: this is a slow, simple implementation; need up/down sweep,
// prevent smem conflicts
smem[threadIdx.x] = in;
@@ -18,7 +112,7 @@ __device__ void inclusivePrefixSum(T* smem, T in, T* out) {
T val = 0;
if (threadIdx.x >= offset) {
- val = smem[threadIdx.x - offset] + smem[threadIdx.x];
+ val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]);
}
__syncthreads();
@@ -38,11 +132,11 @@ __device__ void inclusivePrefixSum(T* smem, T in, T* out) {
}
// Exclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency>
-__device__ void exclusivePrefixSum(T* smem, T in, T* out, T* carry) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) {
// FIXME: crappy implementation
// We kill write-after-read dependencies separately below, hence the `false`
- inclusivePrefixSum<T, false>(smem, in, out);
+ inclusivePrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
*out -= in;
*carry = smem[blockDim.x - 1];
@@ -55,8 +149,8 @@ __device__ void exclusivePrefixSum(T* smem, T in, T* out, T* carry) {
// Inclusive prefix sum for binary vars using intra-warp voting +
// shared memory
-template <typename T, bool KillWARDependency>
-__device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
// Within-warp, we use warp voting.
T vote = __ballot(in);
T index = __popc(getLaneMaskLe() & vote);
@@ -77,8 +171,8 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
int current = 0;
for (int i = 0; i < blockDim.x / 32; ++i) {
T v = smem[i];
- smem[i] += current;
- current += v;
+ smem[i] = binop(smem[i], current);
+ current = binop(current, v);
}
}
@@ -86,7 +180,7 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
// load the carry from the preceding warp
if (warp >= 1) {
- index += smem[warp - 1];
+ index = binop(index, smem[warp - 1]);
}
*out = index;
@@ -98,9 +192,9 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
// Exclusive prefix sum for binary vars using intra-warp voting +
// shared memory
-template <typename T, bool KillWARDependency>
-__device__ void exclusiveBinaryPrefixSum(T* smem, bool in, T* out, T* carry) {
- inclusiveBinaryPrefixSum<T, false>(smem, in, out);
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) {
+ inclusiveBinaryPrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
// Inclusive to exclusive
*out -= (T) in;
diff --git a/lib/THC/THCSortUtils.cu b/lib/THC/THCSortUtils.cu
new file mode 100644
index 0000000..3c4c0f9
--- /dev/null
+++ b/lib/THC/THCSortUtils.cu
@@ -0,0 +1,17 @@
+#include "THCSortUtils.cuh"
+
+// Returns 2^(ceil(lg(n))) from Stanford bit twiddling hacks
+unsigned long nextHighestPowerOf2(unsigned long n) {
+ n--;
+ n |= n >> 1;
+ n |= n >> 2;
+ n |= n >> 4;
+ n |= n >> 8;
+ n |= n >> 16;
+#ifndef _MSC_VER
+ n |= n >> 32;
+#endif
+ n++;
+
+ return n;
+}
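
A few expected values for the helper above (a sketch only, not part of the patch), to make the rounding behaviour concrete; exact powers of two come back unchanged because of the initial n--. The wrapper name nextHighestPowerOf2Examples is illustrative.

    #include <cassert>

    unsigned long nextHighestPowerOf2(unsigned long n);  // defined in THCSortUtils.cu

    void nextHighestPowerOf2Examples() {
      assert(nextHighestPowerOf2(1)  == 1);
      assert(nextHighestPowerOf2(5)  == 8);
      assert(nextHighestPowerOf2(64) == 64);
      assert(nextHighestPowerOf2(65) == 128);
    }
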
diff --git a/lib/THC/THCSortUtils.cuh b/lib/THC/THCSortUtils.cuh
index ec676c0..d5ad237 100644
--- a/lib/THC/THCSortUtils.cuh
+++ b/lib/THC/THCSortUtils.cuh
@@ -41,6 +41,18 @@ __device__ inline void bitonicSwap(K& kA, V& vA, bool& validA,
}
};
+template <typename Comparator, typename K>
+__device__ inline void bitonicSwapKeys(K& kA, bool& validA,
+ K& kB, bool& validB,
+ bool dir,
+ const Comparator& comp) {
+ bool swap = (comp(kA, kB) && validA) || !validB;
+ if (swap == dir) {
+ swapVars(kA, kB);
+ swapVars(validA, validB);
+ }
+}
+
template <typename Comparator, typename K, typename V,
typename IndexType, int Power2SortSize>
__device__ inline void bitonicSort(K keys[Power2SortSize],
@@ -53,12 +65,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
#pragma unroll
for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
-
- // Single warp per slice is completely synchronous
- if (Power2SortSize > 64) {
- __syncthreads();
- }
-
+
+ __syncthreads();
+
unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
bitonicSwap<Comparator, K, V>(
keys[pos], values[pos], valid[pos],
@@ -69,11 +78,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
#pragma unroll
for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
- // Single warp per slice is completely synchronous
- if (Power2SortSize > 64) {
- __syncthreads();
- }
-
+
+ __syncthreads();
+
unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
bitonicSwap<Comparator, K, V>(
keys[pos], values[pos], valid[pos],
@@ -81,10 +88,45 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
false, comp);
}
- // Single warp per slice is completely synchronous
- if (Power2SortSize > 64) {
+ __syncthreads();
+
+}
+
+template <typename Comparator, typename K,
+ typename IndexType, int Power2SortSize>
+__device__ inline void bitonicSortKeys(K keys[Power2SortSize],
+ bool valid[Power2SortSize],
+ const Comparator& comp) {
+#pragma unroll
+ for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
+ bool flag = ((threadIdx.x & (size / 2)) != 0);
+
+#pragma unroll
+ for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
+
+ __syncthreads();
+
+ unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+ bitonicSwapKeys<Comparator, K>(
+ keys[pos], valid[pos],
+ keys[pos + stride], valid[pos + stride],
+ flag, comp);
+ }
+ }
+
+#pragma unroll
+ for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
__syncthreads();
+
+ unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+ bitonicSwapKeys<Comparator, K>(
+ keys[pos], valid[pos],
+ keys[pos + stride], valid[pos + stride],
+ false, comp);
}
+
+ __syncthreads();
+
}
// Sorts (key, value) pairs (in different tensors) in-place; i.e.,
@@ -168,4 +210,6 @@ bitonicSortKVInPlace(TensorInfo<K, IndexType> keys,
}
}
+unsigned long nextHighestPowerOf2(unsigned long n);
+
#endif // THC_SORT_UTILS_INC
diff --git a/lib/THC/THCStream.c b/lib/THC/THCStream.c
deleted file mode 100644
index e261a51..0000000
--- a/lib/THC/THCStream.c
+++ /dev/null
@@ -1,30 +0,0 @@
-#include "THCStream.h"
-
-#include <cuda_runtime_api.h>
-#include "THAtomic.h"
-
-
-THCStream* THCStream_new(int flags)
-{
- THCStream* self = (THCStream*) malloc(sizeof(THCStream));
- self->refcount = 1;
- THCudaCheck(cudaGetDevice(&self->device));
- THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags));
- return self;
-}
-
-void THCStream_free(THCStream* self)
-{
- if (!self) {
- return;
- }
- if (THAtomicDecrementRef(&self->refcount)) {
- THCudaCheck(cudaStreamDestroy(self->stream));
- free(self);
- }
-}
-
-void THCStream_retain(THCStream* self)
-{
- THAtomicIncrementRef(&self->refcount);
-}
diff --git a/lib/THC/THCStream.cpp b/lib/THC/THCStream.cpp
new file mode 100644
index 0000000..49fe680
--- /dev/null
+++ b/lib/THC/THCStream.cpp
@@ -0,0 +1,60 @@
+#include "THCStream.h"
+
+#include <mutex>
+#include <cuda_runtime_api.h>
+#include "THAtomic.h"
+
+#define MAX_DEVICES 256
+static THCStream default_streams[MAX_DEVICES];
+
+static void initialize_default_streams()
+{
+ for (int i = 0; i < MAX_DEVICES; i++) {
+ default_streams[i].device = i;
+ }
+}
+
+THCStream* THCStream_new(int flags)
+{
+ THCStream* self = (THCStream*) malloc(sizeof(THCStream));
+ self->refcount = 1;
+ THCudaCheck(cudaGetDevice(&self->device));
+ THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags));
+ return self;
+}
+
+THC_API THCStream* THCStream_defaultStream(int device)
+{
+ // default streams aren't refcounted
+ THAssert(device >= 0 && device < MAX_DEVICES);
+ static std::once_flag once;
+ std::call_once(once, &initialize_default_streams);
+ return &default_streams[device];
+}
+
+THCStream* THCStream_newWithPriority(int flags, int priority)
+{
+ THCStream* self = (THCStream*) malloc(sizeof(THCStream));
+ self->refcount = 1;
+ THCudaCheck(cudaGetDevice(&self->device));
+ THCudaCheck(cudaStreamCreateWithPriority(&self->stream, flags, priority));
+ return self;
+}
+
+void THCStream_free(THCStream* self)
+{
+ if (!self || !self->stream) {
+ return;
+ }
+ if (THAtomicDecrementRef(&self->refcount)) {
+ THCudaCheckWarn(cudaStreamDestroy(self->stream));
+ free(self);
+ }
+}
+
+void THCStream_retain(THCStream* self)
+{
+ if (self->stream) {
+ THAtomicIncrementRef(&self->refcount);
+ }
+}
diff --git a/lib/THC/THCStream.h b/lib/THC/THCStream.h
index de3f64e..6ccb057 100644
--- a/lib/THC/THCStream.h
+++ b/lib/THC/THCStream.h
@@ -13,6 +13,8 @@ struct THCStream
THC_API THCStream* THCStream_new(int flags);
+THC_API THCStream* THCStream_defaultStream(int device);
+THC_API THCStream* THCStream_newWithPriority(int flags, int priority);
THC_API void THCStream_free(THCStream* self);
THC_API void THCStream_retain(THCStream* self);
diff --git a/lib/THC/THCTensorConv.cu b/lib/THC/THCTensorConv.cu
index 71aac03..c8c1ad6 100644
--- a/lib/THC/THCTensorConv.cu
+++ b/lib/THC/THCTensorConv.cu
@@ -296,7 +296,7 @@ __global__ void conv2genericrev(float *input, float *kernel, float *output,
THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input,
THCudaTensor *kernel, long srow, long scol, const char *type)
{
- THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, output, input, kernel));
long nInputPlane, nInputRows, nInputCols;
long nKernelRows, nKernelCols;
long nOutputPlane, nOutputRows, nOutputCols;
@@ -416,7 +416,7 @@ THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float
THC_API void THCudaTensor_conv2Dmm(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input,
THCudaTensor *kernel, long srow, long scol, const char *type)
{
- THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, output, input, kernel));
long nbatch, nInputPlane, nInputRows, nInputCols;
long nKernelRows, nKernelCols;
long nOutputPlane, nOutputRows, nOutputCols;
@@ -549,7 +549,7 @@ THC_API void THCudaTensor_conv2DRevger(THCState *state, THCudaTensor *output, fl
THCudaTensor *input, THCudaTensor *kernel,
long srow, long scol)
{
- THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, output, input, kernel));
long nInputPlane, nInputRows, nInputCols;
long nKernelPlane, nKernelRows, nKernelCols;
long nOutputRows, nOutputCols;
@@ -883,7 +883,7 @@ THC_API void THCudaTensor_conv2Dmap(THCState *state, THCudaTensor *output, THCud
THCudaTensor *kernel, long stride_x, long stride_y,
THCudaTensor *table, long fanin)
{
- THAssert(THCudaTensor_checkGPU(state, 4, output, input, kernel, table));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 4, output, input, kernel, table));
long nInputPlane, nInputRows, nInputCols;
long nKernelRows, nKernelCols;
long nOutputPlane, nOutputRows, nOutputCols;
diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h
index e8bc4f4..74f2b59 100644
--- a/lib/THC/THCTensorCopy.h
+++ b/lib/THC/THCTensorCopy.h
@@ -4,6 +4,7 @@
#include "THCTensor.h"
#include "THCGeneral.h"
#include "THCHalf.h"
+#include "THCStream.h"
#include "generic/THCTensorCopy.h"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu
index 41e6466..b9225fe 100644
--- a/lib/THC/THCTensorMath.cu
+++ b/lib/THC/THCTensorMath.cu
@@ -107,6 +107,32 @@ struct NonZeroOp
}
};
+template<typename T, typename accT = T>
+struct LinspaceOp {
+ __host__ __device__ LinspaceOp(accT start, accT step):
+ start_(start), step_(step) { }
+ __device__ __forceinline__ T operator()(ptrdiff_t index) {
+ accT increment = THCNumerics<accT>::mul(step_, ScalarConvert<ptrdiff_t,accT>::to(index));
+ accT value = THCNumerics<accT>::add(start_, increment);
+ return ScalarConvert<accT,T>::to(value);
+ }
+
+ const accT start_, step_;
+};
+
+template<typename T, typename accT = T>
+struct LogspaceOp {
+ __host__ __device__ LogspaceOp(accT start, accT step):
+ start_(start), step_(step) { }
+ __device__ __forceinline__ T operator()(ptrdiff_t index) {
+ accT increment = THCNumerics<accT>::mul(step_, ScalarConvert<ptrdiff_t,accT>::to(index));
+ accT value = THCNumerics<accT>::exp10(THCNumerics<accT>::add(start_, increment));
+ return ScalarConvert<accT,T>::to(value);
+ }
+
+ const accT start_, step_;
+};
+
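
For intuition, a small host-side sketch (not part of the patch) of the per-index arithmetic these functors perform; the start, end, and n values are illustrative, with logspace taking 10 to the power of the corresponding linspace value.

    #include <cmath>
    #include <cstdio>

    int main() {
      const double start = 1.0, end = 3.0;
      const long n = 5;
      const double step = (end - start) / (n - 1);
      for (long i = 0; i < n; ++i) {
        double lin = start + step * i;        // what LinspaceOp computes per index
        double lgs = std::pow(10.0, lin);     // what LogspaceOp computes per index
        std::printf("linspace: %g  logspace: %g\n", lin, lgs);
      }
      return 0;
    }
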
#include "generic/THCTensorMath.cu"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index 19ae679..b888672 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -43,6 +43,12 @@
#include "generic/THCTensorSort.h"
#include "THCGenerateAllTypes.h"
+#include "generic/THCTensorMode.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorTopK.h"
+#include "THCGenerateAllTypes.h"
+
THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self);
THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self);
diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu
index 7e6af9b..aaee332 100644
--- a/lib/THC/THCTensorMath2.cu
+++ b/lib/THC/THCTensorMath2.cu
@@ -16,7 +16,7 @@ struct TensorATan2Op {
void THCudaTensor_atan2(THCState *state, THCudaTensor *self_, THCudaTensor *tx, THCudaTensor *ty)
{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, tx, ty));
+ THCAssertSameGPU(THCudaTensor_checkGPU(state, 3, self_, tx, ty));
THArgCheck(THCudaTensor_nElement(state, tx) ==
THCudaTensor_nElement(state, ty), 3, "sizes do not match");
THCudaTensor_resizeAs(state, self_, tx);
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 094cf0b..efefd76 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -244,11 +244,17 @@ template <typename T>
struct TensorRemainderOp {
TensorRemainderOp(T v) : val(v) {}
__device__ __forceinline__ void operator()(T* out, T* in) {
- *out = *in - val * (*in / val);
+ *out = *in % val;
+ if ((*out * val) < 0){
+ *out += val;
+ }
}
__device__ __forceinline__ void operator()(T* v) {
- *v = *v - val * (*v / val);
+ *v = *v % val;
+ if ((*v * val) < 0){
+ *v += val;
+ }
}
const T val;
@@ -399,5 +405,75 @@ struct TensorTriOp {
const long stride0, stride1, k;
};
+template <typename T>
+struct TensorLShiftConstantOp {
+ TensorLShiftConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in << val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v <<= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorRShiftConstantOp {
+ TensorRShiftConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in >> val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v >>= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorBitAndConstantOp {
+ TensorBitAndConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in & val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v &= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorBitOrConstantOp {
+ TensorBitOrConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in | val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v |= val;
+ }
+
+ const T val;
+};
+
+template <typename T>
+struct TensorBitXorConstantOp {
+ TensorBitXorConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = *in ^ val;
+ }
+
+ __device__ __forceinline__ void operator()(T* v) {
+ *v ^= val;
+ }
+
+ const T val;
+};
+
#include "generic/THCTensorMathPairwise.cu"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
index de96cad..6ab010a 100644
--- a/lib/THC/THCTensorMathPointwise.cuh
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -415,11 +415,17 @@ struct TensorDivOp<half> {
template <typename T>
struct TensorCRemainderOp {
__device__ __forceinline__ void operator()(T* out, T* in) {
- *out = *in != 0 ? *out - *in * (*out / *in) : NAN;
+ *out = *out % *in;
+ if ((*out * *in)<0){
+ *out += *in;
+ }
}
__device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
- *out = *in2 != 0 ? *in1 - *in2 * (*in1 / *in2) : NAN;
+ *out = *in1 % *in2;
+ if ((*out * *in2)<0){
+ *out += *in2;
+ }
}
};
@@ -660,4 +666,123 @@ struct TensorAddCDivOp {
T val;
};
+template <typename T>
+struct TensorLShiftOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out <<= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 << *in2;
+ }
+};
+
+template <>
+struct TensorLShiftOp<float> {
+ __device__ __forceinline__ void
+ operator()(float* out, float* in) {
+ *out *= powf(2.0f, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(float* out, float* in1, float* in2) {
+ *out = *in1 * powf(2.0f, *in2);
+ }
+};
+
+template <>
+struct TensorLShiftOp<double> {
+ __device__ __forceinline__ void
+ operator()(double* out, double* in) {
+ *out *= pow(2.0, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(double* out, double* in1, double* in2) {
+ *out = *in1 * pow(2.0, *in2);
+ }
+};
+
+template <typename T>
+struct TensorRShiftOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out >>= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 >> *in2;
+ }
+};
+
+
+template <>
+struct TensorRShiftOp<float> {
+ __device__ __forceinline__ void
+ operator()(float* out, float* in) {
+ *out /= powf(2.0f, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(float* out, float* in1, float* in2) {
+ *out = *in1 / powf(2.0f, *in2);
+ }
+};
+
+template <>
+struct TensorRShiftOp<double> {
+ __device__ __forceinline__ void
+ operator()(double* out, double* in) {
+ *out /= pow(2.0, *in);
+ }
+
+ __device__ __forceinline__ void
+ operator()(double* out, double* in1, double* in2) {
+ *out = *in1 / pow(2.0, *in2);
+ }
+};
+
+template <typename T>
+struct TensorBitAndOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out &= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 & *in2;
+ }
+};
+
+template <typename T>
+struct TensorBitOrOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out |= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 | *in2;
+ }
+};
+
+template <typename T>
+struct TensorBitXorOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out ^= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 ^ *in2;
+ }
+};
+
+
#endif // THC_TENSORMATH_POINTWISE_CUH
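
The float/double specializations of the shift ops above define shifting as scaling by powers of two rather than bit manipulation; a short host-side sketch of that convention (illustrative only, helper names lshiftf/rshiftf are not from the patch):

    #include <cmath>
    #include <cstdio>

    float lshiftf(float a, float k) { return a * powf(2.0f, k); }  // "a << k" for floats
    float rshiftf(float a, float k) { return a / powf(2.0f, k); }  // "a >> k" for floats

    int main() {
      std::printf("%g %g\n", lshiftf(3.0f, 2.0f), rshiftf(3.0f, 2.0f));  // 12 0.75
      return 0;
    }
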
diff --git a/lib/THC/THCTensorMathReduce.cu b/lib/THC/THCTensorMathReduce.cu
index 446daec..1025366 100644
--- a/lib/THC/THCTensorMathReduce.cu
+++ b/lib/THC/THCTensorMathReduce.cu
@@ -2,7 +2,7 @@
THC_API int
THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
- THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+ THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self));
unsigned char result;
if (!THC_reduceAll(state, self,
thrust::identity<unsigned char>(),
@@ -17,7 +17,7 @@ THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
THC_API int
THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self) {
- THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+ THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self));
unsigned char result;
if (!THC_reduceAll(state, self,
thrust::identity<unsigned char>(),
diff --git a/lib/THC/THCTensorMathReduce.cuh b/lib/THC/THCTensorMathReduce.cuh
index 5fefbab..5051fbe 100644
--- a/lib/THC/THCTensorMathReduce.cuh
+++ b/lib/THC/THCTensorMathReduce.cuh
@@ -469,8 +469,8 @@ kernelTransformReduceOuterDimIndex(K *tgt1,
for (unsigned col = 0; col < row_size; ++col) {
// +1 for Lua index
- acc = binary_op(thrust::make_pair<K, Index>(*src, col + TH_INDEX_BASE),
- acc);
+ acc = binary_op(acc,
+ thrust::make_pair<K, Index>(*src, col + TH_INDEX_BASE));
src += num_irows;
}
@@ -550,7 +550,7 @@ kernelTransformReduceInnermostDimIndex(K *tgt1,
K *src = src_ + row * row_size;
// Sequential reduction within a thread.
for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
- acc = binary_op(thrust::make_pair<K, Index>(src[col], col + TH_INDEX_BASE), acc);
+ acc = binary_op(acc, thrust::make_pair<K, Index>(src[col], col + TH_INDEX_BASE));
}
}
@@ -625,6 +625,7 @@ THC_reduceDimIndex(THCState *state,
TensorTypeIndex *tgt2_,
TensorTypeK *src,
long dimension,
+ int keepdim,
const thrust::pair<
typename TensorUtils<TensorTypeK>::DataType,
typename TensorUtils<TensorTypeIndex>::DataType>& init,
@@ -653,6 +654,10 @@ THC_reduceDimIndex(THCState *state,
TensorUtils<TensorTypeK>::free(state, src);
TensorUtils<TensorTypeK>::freeCopyTo(state, tgt1, tgt1_);
TensorUtils<TensorTypeIndex>::freeCopyTo(state, tgt2, tgt2_);
+ if (!keepdim) {
+ TensorUtils<TensorTypeK>::squeeze1d(state, tgt1_, tgt1_, dimension);
+ TensorUtils<TensorTypeIndex>::squeeze1d(state, tgt2_, tgt2_, dimension);
+ }
}
template <typename T, typename Index>
diff --git a/lib/THC/THCTensorMathScan.cu b/lib/THC/THCTensorMathScan.cu
index 3345e25..6f01bd2 100644
--- a/lib/THC/THCTensorMathScan.cu
+++ b/lib/THC/THCTensorMathScan.cu
@@ -6,6 +6,8 @@
#include "THCReduce.cuh"
#include "THCNumerics.cuh"
#include "THCTensorMathReduce.cuh"
+#include <thrust/scan.h>
+#include <thrust/execution_policy.h>
/* Perform an inclusive scan along an outer dimension of a tensor.
*
@@ -20,8 +22,8 @@
*/
template<typename T, class BinaryOp>
__global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_,
- unsigned num_orows, unsigned num_irows, unsigned row_size,
- T init, BinaryOp binary_op)
+ unsigned num_orows, unsigned num_irows, unsigned row_size,
+ T init, BinaryOp binary_op)
{
for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
@@ -52,8 +54,8 @@ __global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_,
*/
template<typename T, int num_threads_x, int num_threads_y, class BinaryFunction>
__global__ void THCTensor_kernel_scanInnermostDim(T *tgt_, T *src_,
- unsigned num_rows, unsigned row_size,
- T init, BinaryFunction binary_op)
+ unsigned num_rows, unsigned row_size,
+ T init, BinaryFunction binary_op)
{
__shared__ T sbuf[num_threads_y][2 * num_threads_x];
diff --git a/lib/THC/THCTensorMode.cu b/lib/THC/THCTensorMode.cu
new file mode 100644
index 0000000..aa6c628
--- /dev/null
+++ b/lib/THC/THCTensorMode.cu
@@ -0,0 +1,16 @@
+#include "THC.h"
+#include "THCThrustAllocator.cuh"
+#include "THCTensorTypeUtils.cuh"
+#include "THCReduceApplyUtils.cuh"
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/inner_product.h>
+#include <thrust/device_vector.h>
+#include <thrust/extrema.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+
+#include "THCTensorMode.cuh"
+
+#include "generic/THCTensorMode.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMode.cuh b/lib/THC/THCTensorMode.cuh
new file mode 100644
index 0000000..b67ac2a
--- /dev/null
+++ b/lib/THC/THCTensorMode.cuh
@@ -0,0 +1,282 @@
+#ifndef THC_TENSOR_MODE_CUH
+#define THC_TENSOR_MODE_CUH
+
+#include "THCNumerics.cuh"
+#include "THCSortUtils.cuh"
+#include "THCScanUtils.cuh"
+
+struct ThrustHalfLess
+{
+ __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) {
+ return THCNumerics<half>::lt(lhs, rhs);
+ }
+};
+
+struct ThrustHalfNotEqualTo
+{
+ __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) {
+ return THCNumerics<half>::ne(lhs, rhs);
+ }
+};
+
+struct ThrustHalfEqualTo
+{
+ __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) {
+ return THCNumerics<half>::eq(lhs, rhs);
+ }
+};
+
+struct ThrustHalfEqualToPredicate
+{
+ ThrustHalfEqualToPredicate(half val): val_(val) {}
+ __host__ __device__ inline bool operator()(half x) {
+ return THCNumerics<half>::eq(val_, x);
+ }
+
+ half val_;
+};
+
+template <typename T>
+struct BinaryAddOp {
+ __host__ __device__ inline T operator()(const T a, const T b) {
+ return THCNumerics<T>::add(a, b);
+ }
+};
+
+template <>
+struct BinaryAddOp<unsigned int> {
+ __host__ __device__ inline unsigned int operator()(const unsigned int a, const unsigned int b) {
+ return a + b;
+ }
+};
+
+// Used for a segmented reduction
+struct ModeUnsignedBoolPair {
+ unsigned int val;
+ bool flag;
+};
+
+// In the kernel below, we have a common pattern of reducing (unsigned int, unsigned int)
+// pairs of data
+struct ModeUnsignedPair {
+ unsigned int val;
+ unsigned int index;
+};
+
+template <typename T>
+struct MaxReduceOp {
+ __host__ __device__ inline T operator()(const T& a, const T& b) {
+ return b.val > a.val ? b : a;
+ }
+};
+
+template <typename T>
+struct MatchReduceOp {
+ __host__ __device__ inline T operator()(const T& a, const T& b) {
+ return b.flag ? b : a;
+ }
+};
+
+// The mode kernel has the following characteristics: It uses internal shared memory
+// buffers of Power2Size, which must be greater than the number of elements. Additionally,
+// there is one block for every slice to calculate the mode for, and in each block there
+// is one thread for every two elements.
+//
+// Both sorted and positions are assumed to be contiguous Tensors with the mode dimension
+// as the innermost dim, such that we can get the particular slice for a Tensor via its
+// linear block dimension * the slice size.
+template <typename T, unsigned int Power2Size>
+__global__ void computeMode(
+ T *input,
+ TensorInfo<T, unsigned int> values,
+ TensorInfo<long, unsigned int> indices,
+ long sliceSize)
+{
+ int tidx = threadIdx.x;
+ int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for
+
+ // First, we need to calculate the offset into the sorted Tensor that represents
+ // the start of the slice for this block to calculate the mode for. This offset
+ // is a combination of the gridIndices, and the number of elements in the slice.
+ unsigned int blockId = getLinearBlockId<unsigned int>();
+ unsigned int linearOffset = blockId * sliceSize;
+
+ // shmem is a dynamically sized buffer we will use throughout the kernel to
+ // handle computation efficiently. The size of this shmem must be
+ // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size)
+ //
+ // Initially, the buffer will be organized as follows:
+ //
+ // [smem (slice elements) | bmem (valid indices) | <scratch space>]
+ extern __shared__ char shmem[];
+
+ // smem represents a proportion of the shared memory buffer that is used to store
+ // the elements from the slice:
+ T *smem = reinterpret_cast<T *>(shmem);
+
+ // Each thread loads up to two elements from the Tensor into shared memory
+ if (tidx < sliceSize) {
+ smem[tidx] = input[linearOffset + tidx];
+ }
+ if (stidx < sliceSize) {
+ smem[stidx] = input[linearOffset + stidx];
+ }
+
+ // Next, we initialize a boolean region of the buffer, offset by the loaded element
+ // smem region
+ bool *bmem = reinterpret_cast<bool *>(&smem[Power2Size]);
+
+ // The first use of this region stores bmem[i] = i < sliceSize to mark the valid
+ // components in the smem buffer
+ bmem[tidx] = tidx < sliceSize;
+ bmem[stidx] = stidx < sliceSize;
+ __syncthreads(); // barrier for smem, bmem initialization
+
+ // First, sort the input slice in ascending order. smem contains the input
+ // elements, and bmem marks the valid indices
+ bitonicSortKeys<LTComp<T>, T, unsigned int, Power2Size>(smem, bmem, LTComp<T>());
+ __syncthreads(); // make no assumptions that the sort syncs at end
+
+ // The next step of our algorithm is performing a block-wide comparison of
+ // neighboring elements. In particular, given a sorted input slice A, we
+ // produce an output slice B, such that B[i] = 1 if A[i-1] != A[i], otherwise 0.
+ //
+ // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8]
+ // B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+ //
+ // In particular, we can think of B[i] true indicating the start of a sequence of
+ // equal values in the sorted list. Similarly, we will also store the negation of B,
+ // which we'll call C. In particular, we can think of C[i] = true iff A[i-1] == A[i]
+ // in our original sorted slice.
+ //
+ // C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+
+ // We overwrite bmem, and treat the rest of shared memory as a buffer of (index, flag) pairs
+ // where the index represents values from C, and the flag represents values from B.
+ //
+ // [smem (sorted slice) | ubpmem (index, flag pairs)]
+
+ struct ModeUnsignedBoolPair *ubpmem = reinterpret_cast<struct ModeUnsignedBoolPair *>(
+ &smem[Power2Size]);
+
+ if (tidx == 0) {
+ ubpmem[0].flag = true;
+ ubpmem[0].val = 0;
+ }
+
+ // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ...
+ ubpmem[tidx * 2 + 1].flag = THCNumerics<T>::ne(smem[tidx * 2], smem[tidx * 2 + 1]); // (0, 1), (1, 2), etc.
+ ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag;
+
+ // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ...
+ if (((tidx + 1) * 2) < Power2Size) {
+ ubpmem[(tidx + 1) * 2].flag = THCNumerics<T>::ne(smem[((tidx + 1) * 2) - 1], smem[(tidx + 1) * 2]);
+ ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag;
+ }
+ __syncthreads(); // barrier for ubpmem initialization
+
+ // Next, we perform a segmented prefix sum on the neighboring elements, where
+ // the presence of a one indicates the start of a segment. In this case B acts
+ // as the segment start flags, and C is the buffer to be summed:
+ //
+ // Input (C) = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+ // Flag (B) = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+ // Output (C) = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0]
+ //
+ // Afterwards, the (index) components of the ubpmem buffer contain the lengths of the
+ // segments (minus 1), i.e. the counts of each element in the original input.
+
+ inclusivePrefixScan<
+ struct ModeUnsignedBoolPair,
+ struct SegmentedScanOp<struct ModeUnsignedBoolPair, BinaryAddOp<unsigned int> >,
+ Power2Size>(
+ ubpmem,
+ SegmentedScanOp<struct ModeUnsignedBoolPair, BinaryAddOp<unsigned int> >(BinaryAddOp<unsigned int>()));
+ // assumes scan syncs at the end
+
+ // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e. we treat the
+ // boolean flag regions as integers). We initialize these to represent indices, and we'll call
+ // this buffer I
+ struct ModeUnsignedPair *uupmem = reinterpret_cast<struct ModeUnsignedPair *>(ubpmem);
+
+ // At this point, we need to find the maximum element in lengths buffer C.
+ // This element will represent the count (-1) of the mode. Because of the
+ // way we have set up the problem, the index where this mode occurs will
+ // also be the location of the mode value in the sorted array, e.g.
+ //
+ // smem = [0, 0, 1, 1, 1, 2]
+ // C = [0, 1, 0, 1, 2, 0]
+ // I = [0, 1, 2, 3, 4, 5]
+ // ^
+ // maximum value, also aligned with mode = 1
+ //
+ // We perform a block wide max-reduction of the C buffer, but we also need the
+ // indices to come along with it, so we utilize the uupmem construction.
+ //
+ // At the end we need to return the ModeUnsignedPair containing index = 4, val = 2,
+ // which represents the max
+
+ // In practice, we will make each thread locally reduce 2 values in its registers prior
+ // to the global block-wide reduction. Note that instead of tidx/stidx, we utilize tidx * 2,
+ // tidx * 2 + 1, so each thread deals with adjacent elements. This is because the reduce
+ // code below relies on thread elements to be adjacent.
+ struct ModeUnsignedPair uup[2];
+ uup[0].index = tidx * 2;
+ uup[0].val = ubpmem[tidx * 2].val;
+ uup[1].index = tidx * 2 + 1;
+ uup[1].val = ubpmem[tidx * 2 + 1].val;
+ __syncthreads();
+
+ struct ModeUnsignedPair max = {0, 0};
+
+ max = reduceBlockWithNThreadLocalReductions<struct ModeUnsignedPair, MaxReduceOp<struct ModeUnsignedPair>, 2>
+ (uupmem, uup, sliceSize, MaxReduceOp<struct ModeUnsignedPair>(), max);
+
+ // Store the mode in shared memory for use in finding the mode in the input slice
+ __shared__ T mode;
+
+ // Given the above constraints, the mode is the value at the reduced index in the
+ // original sorted element buffer
+ if (tidx == 0) {
+ mode = smem[max.index];
+ }
+ __syncthreads(); // broadcast mode
+
+ // Finally, we need to find an index of the mode in the input Tensor. The API does
+ // not constrain which index we pick, so it can be any of the indices that contain the mode.
+ // We will do a reduction to find the index. We go back to using the (index, flag) buffer
+ // arrangement. First, we mark indices that are equal to the mode, i.e. B[i] = true if
+ // input[i] == mode, and initialize C[i] to be the index
+ //
+ // Again we reduce 2 elements in the thread's registers prior to the block-wide reduction
+ struct ModeUnsignedBoolPair ubpp[2];
+ if (tidx * 2 < sliceSize) {
+ ubpp[0].flag = THCNumerics<T>::eq(input[linearOffset + (tidx * 2)], mode);
+ ubpp[0].val = tidx * 2;
+ }
+ if (tidx * 2 + 1 < sliceSize) {
+ ubpp[1].flag = THCNumerics<T>::eq(input[linearOffset + (tidx * 2 + 1)], mode);
+ ubpp[1].val = tidx * 2 + 1;
+ }
+
+ // Then we perform a similar reduction to the one above, except this time we update
+ // the element if the element at the base position is not equal to the mode and
+ // the element at the offset position is. At the end, C[0] will contain an index
+ // with the mode.
+ struct ModeUnsignedBoolPair match = {0, false};
+
+ match = reduceBlockWithNThreadLocalReductions<struct ModeUnsignedBoolPair, MatchReduceOp<struct ModeUnsignedBoolPair>, 2>
+ (ubpmem, ubpp, sliceSize, MatchReduceOp<struct ModeUnsignedBoolPair>(), match);
+
+ // Finally, we have the mode, and an index where it occurs. We use a single thread
+ // to place this in the appropriate output position
+ if (tidx == 0) {
+ long index = TH_INDEX_BASE + match.val;
+
+ unsigned int outputOffset = IndexToOffset<T, unsigned int, -1>::get(blockId, values);
+ values.data[outputOffset] = mode;
+ indices.data[outputOffset] = index;
+ }
+}
+
+#endif // THC_TENSOR_MODE_CUH
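
A compact CPU reference for the overall strategy the kernel comments describe (sort the slice, flag run boundaries where neighbours differ, take the value of the longest run). This is an illustrative sketch of the algorithm only, not the kernel's shared-memory data layout; the helper name modeOfSlice is hypothetical.

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    template <typename T>
    std::pair<T, int> modeOfSlice(std::vector<T> slice) {
      std::sort(slice.begin(), slice.end());
      T best = slice[0];
      int bestCount = 0, runLength = 0;
      for (size_t i = 0; i < slice.size(); ++i) {
        runLength = (i > 0 && slice[i] == slice[i - 1]) ? runLength + 1 : 1;
        if (runLength > bestCount) { bestCount = runLength; best = slice[i]; }
      }
      return std::make_pair(best, bestCount);
    }

    int main() {
      std::vector<int> slice = {0, 0, 1, 1, 1, 2};   // example from the comments above
      std::pair<int, int> m = modeOfSlice(slice);
      std::printf("mode=%d count=%d\n", m.first, m.second);  // mode=1 count=3
      return 0;
    }
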
diff --git a/lib/THC/THCTensorRandom.cuh b/lib/THC/THCTensorRandom.cuh
index d78409f..5afd8fe 100644
--- a/lib/THC/THCTensorRandom.cuh
+++ b/lib/THC/THCTensorRandom.cuh
@@ -97,44 +97,56 @@ __device__ int binarySearchForMultinomial(T* dist,
return start;
}
-template <typename T>
+template <typename T, typename AccT>
__global__ void
sampleMultinomialOnce(long* dest,
long distributions,
int categories,
T* sampled,
T* dist) {
- extern __shared__ __align__(sizeof(T)) unsigned char my_smem[];
+ extern __shared__ __align__(sizeof(AccT)) unsigned char my_smem[];
+ __shared__ bool found;
+
+ // Shared memory holds blockDim.x T values for the cumulative sum and
+ // blockDim.x AccT values for normalizing the probabilities.
T *smem = reinterpret_cast<T *>(my_smem);
+ AccT *asmem = reinterpret_cast<AccT *>(&my_smem[blockDim.x * sizeof(T)]);
+
+ AccT accZero = ScalarConvert<int, AccT>::to(0);
T zero = ScalarConvert<int, T>::to(0);
for (long curDist = blockIdx.x;
curDist < distributions; curDist += gridDim.x) {
// Each block handles one distribution
// First pass, find the total sum of the distribution
- T sum = zero;
+ AccT sum = accZero;
for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) {
- sum = THCNumerics<T>::add(sum, dist[curDist * categories + cat]);
+ sum = THCNumerics<AccT>::add(
+ sum,
+ ScalarConvert<T, AccT>::to(dist[curDist * categories + cat]));
}
// threadIdx.x == 0 has the sum value from this
- sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<T, T>(), zero);
+ sum = reduceBlock(asmem, blockDim.x, sum, ReduceAdd<AccT, AccT>(), accZero);
// Broadcast sum and sample value
if (threadIdx.x == 0) {
- smem[0] = sum;
- smem[1] = sampled[curDist];
+ // Make sure the sum of our distribution didn't overflow
+ assert(!isinf(sum));
+
+ asmem[0] = sum;
+ smem[0] = sampled[curDist];
}
__syncthreads();
- sum = smem[0];
- T sample = smem[1];
+ sum = asmem[0];
+ T sample = smem[0];
__syncthreads();
- if (THCNumerics<T>::eq(sum, zero) || THCNumerics<T>::eq(sample, zero)) {
+ if (THCNumerics<AccT>::eq(sum, accZero) || THCNumerics<T>::eq(sample, zero)) {
// Choose the first element
if (threadIdx.x == 0) {
- dest[curDist] = 1;
+ dest[curDist] = TH_INDEX_BASE;
}
continue;
@@ -142,16 +154,20 @@ sampleMultinomialOnce(long* dest,
int chunks = THCCeilDiv(categories, (int) blockDim.x);
T prevHighProb = zero;
+ found = false;
- for (int chunk = 0; chunk < chunks; ++chunk) {
+ for (int chunk = 0; chunk < chunks && !found; ++chunk) {
// All threads in bounds load a value
int cat = chunk * blockDim.x + threadIdx.x;
- T val =
- cat < categories ? THCNumerics<T>::div(dist[curDist * categories + cat], sum) :
- zero;
+ AccT val =
+ cat < categories ?
+ THCNumerics<AccT>::div(
+ ScalarConvert<T, AccT>::to(dist[curDist * categories + cat]),
+ sum) :
+ accZero;
- smem[threadIdx.x] = val;
+ smem[threadIdx.x] = ScalarConvert<AccT, T>::to(val);
__syncthreads();
// Perform an inclusive prefix sum of the shared memory contents
@@ -183,8 +199,8 @@ sampleMultinomialOnce(long* dest,
if (inBucket) {
// We're done; we have the sample
// Torch indices are 1-based
- // FIXME: broadcast exit flag?
dest[curDist] = cat + TH_INDEX_BASE;
+ found = true;
}
// Store the previous scan's high value for future use
@@ -192,6 +208,21 @@ sampleMultinomialOnce(long* dest,
__syncthreads();
}
+
+ if (threadIdx.x == 0 && !found) {
+ // This should address a rare bug where we don't select a valid index. This likely occurs when,
+ // due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1
+ // and our uniform sample is greater than this value. In this case we likely have uninitialized
+ // memory in dest[curDist]. So we loop through the distribution and pick the largest index
+ // where the distribution is non-zero. This is obviously inefficient, but given how rarely
+ // this occurs, it should not be an issue.
+ for (int cat = categories - 1; cat >= 0; --cat) {
+ if (THCNumerics<T>::gt(dist[curDist * categories + cat], zero)) {
+ dest[curDist] = cat + TH_INDEX_BASE;
+ break;
+ }
+ }
+ }
}
}
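
A serial sketch of the sampling logic including the new fallback path (assumptions: probabilities are non-negative, u is a uniform sample in [0, 1), and indices are 1-based as in Torch); illustrative only, and the helper name sampleMultinomialOnceRef is not from the patch:

    #include <cstdio>
    #include <vector>

    long sampleMultinomialOnceRef(const std::vector<float>& dist, float u) {
      float sum = 0.f;
      for (float p : dist) sum += p;
      if (sum == 0.f || u == 0.f) return 1;           // degenerate case: first bucket
      float cum = 0.f;
      for (size_t i = 0; i < dist.size(); ++i) {
        cum += dist[i] / sum;
        if (u < cum) return (long)i + 1;              // sample landed in this bucket
      }
      // Rounding kept the cumulative sum below u: fall back to the largest
      // non-zero category, mirroring the new !found path in the kernel.
      for (size_t i = dist.size(); i-- > 0; )
        if (dist[i] > 0.f) return (long)i + 1;
      return 1;
    }

    int main() {
      std::vector<float> dist = {0.1f, 0.2f, 0.7f};
      std::printf("%ld\n", sampleMultinomialOnceRef(dist, 0.25f));  // 2
      return 0;
    }
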
diff --git a/lib/THC/THCTensorScatterGather.cu b/lib/THC/THCTensorScatterGather.cu
index f3f3928..18c9dee 100644
--- a/lib/THC/THCTensorScatterGather.cu
+++ b/lib/THC/THCTensorScatterGather.cu
@@ -92,7 +92,8 @@ __global__ void THCudaTensor_gatherKernel(
tensor, &tensorOffset,
src, &srcOffset);
- IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+ long indexValue = index.data[indexOffset] - TH_INDEX_BASE;
+ assert(indexValue >= 0 && indexValue < src.sizes[dim]);
srcOffset += indexValue * src.strides[dim];
tensor.data[tensorOffset] = src.data[srcOffset];
@@ -118,7 +119,8 @@ __global__ void THCudaTensor_scatterKernel(
src, &srcOffset,
tensor, &tensorOffset);
- IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+ long indexValue = index.data[indexOffset] - TH_INDEX_BASE;
+ assert(indexValue >= 0 && indexValue < tensor.sizes[dim]);
tensorOffset += indexValue * tensor.strides[dim];
tensor.data[tensorOffset] = src.data[srcOffset];
@@ -142,7 +144,8 @@ __global__ void THCudaTensor_scatterFillKernel(
index, &indexOffset,
tensor, &tensorOffset);
- IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+ long indexValue = index.data[indexOffset] - TH_INDEX_BASE;
+ assert(indexValue >= 0 && indexValue < tensor.sizes[dim]);
tensorOffset += indexValue * tensor.strides[dim];
tensor.data[tensorOffset] = value;
diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu
index 589d3e9..f5f7338 100644
--- a/lib/THC/THCTensorSort.cu
+++ b/lib/THC/THCTensorSort.cu
@@ -1,21 +1,5 @@
#include "THCTensorSort.cuh"
-// Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks
-unsigned long nextHighestPowerOf2(unsigned long n) {
- n--;
- n |= n >> 1;
- n |= n >> 2;
- n |= n >> 4;
- n |= n >> 8;
- n |= n >> 16;
-#ifndef _MSC_VER
- n |= n >> 32;
-#endif
- n++;
-
- return n;
-}
-
void THCudaLongTensor_fillSliceWithIndex(THCState* state,
THCudaLongTensor* t,
int dim) {
diff --git a/lib/THC/THCTensorSort.cuh b/lib/THC/THCTensorSort.cuh
index 381f111..d47ee20 100644
--- a/lib/THC/THCTensorSort.cuh
+++ b/lib/THC/THCTensorSort.cuh
@@ -80,7 +80,6 @@ struct GlobalIndexToPerSliceIndex {
const long sliceSize;
};
-unsigned long nextHighestPowerOf2(unsigned long n);
void THCudaLongTensor_fillSliceWithIndex(THCState* state,
THCudaLongTensor* t,
int dim);
diff --git a/lib/THC/THCTensorTopK.cu b/lib/THC/THCTensorTopK.cu
index ec26178..325d560 100644
--- a/lib/THC/THCTensorTopK.cu
+++ b/lib/THC/THCTensorTopK.cu
@@ -5,531 +5,15 @@
#include "THCAsmUtils.cuh"
#include "THCScanUtils.cuh"
#include "THCTensorTypeUtils.cuh"
+#include "THCTensorMathReduce.cuh"
#include <algorithm> // for std::min
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif
-// Converts a float to an integer representation with the same
-// sorting; i.e., for floats f1, f2:
-// if f1 < f2 then convert(f1) < convert(f2)
-// We use this to enable radix selection of floating-point values.
-// This also gives a relative order for NaNs, but that's ok, as they
-// will all be adjacent
-struct FloatToSortedInt {
- inline __host__ __device__ FloatToSortedInt() {}
+#include "THCTensorTopK.cuh"
- inline __device__ unsigned int convert(float v) const {
- unsigned int x = __float_as_int(v);
- unsigned int mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
+#include "generic/THCTensorTopK.cu"
+#include "THCGenerateAllTypes.h"
- return (x ^ mask);
- }
-
- inline __device__ float deconvert(unsigned int v) const {
- unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
-
- return __int_as_float(v ^ mask);
- }
-};
-
-// This function counts the distribution of all input values in a
-// slice we are selecting by radix digit at `radixDigitPos`, but only
-// those that pass the filter `((v & desiredMask) == desired)`.
-// This produces and broadcasts the seen counts for a single block only.
-// `smem` must have at least `RadixSize` elements.
-template <typename DataType, typename BitDataType,
- typename IndexType, typename CountType,
- typename RadixConverter, int RadixSize, int RadixBits>
-__device__ void countRadixUsingMask(const RadixConverter& conv,
- CountType counts[RadixSize],
- CountType* smem,
- BitDataType desired,
- BitDataType desiredMask,
- int radixDigitPos,
- IndexType sliceSize,
- IndexType withinSliceStride,
- DataType* data) {
- // Clear out per-thread counts from a previous round
-#pragma unroll
- for (int i = 0; i < RadixSize; ++i) {
- counts[i] = 0;
- }
-
- if (threadIdx.x < RadixSize) {
- smem[threadIdx.x] = 0;
- }
- __syncthreads();
-
- // Scan over all the data. Upon a read, the warp will accumulate
- // counts per each digit in the radix using warp voting.
- for (IndexType i = threadIdx.x; i < sliceSize; i += blockDim.x) {
- BitDataType val = conv.convert(doLdg(&data[i * withinSliceStride]));
-
- bool hasVal = ((val & desiredMask) == desired);
- unsigned int digitInRadix = getBitfield(val, radixDigitPos, RadixBits);
-
-#pragma unroll
- for (unsigned int j = 0; j < RadixSize; ++j) {
- bool vote = hasVal && (digitInRadix == j);
- counts[j] += __popc(__ballot(vote));
- }
- }
-
- // Now, for each warp, sum values
- if (getLaneId() == 0) {
-#pragma unroll
- for (unsigned int i = 0; i < RadixSize; ++i) {
- atomicAdd(&smem[i], counts[i]);
- }
- }
-
- __syncthreads();
-
- // For each thread, read in the total counts
-#pragma unroll
- for (unsigned int i = 0; i < RadixSize; ++i) {
- counts[i] = smem[i];
- }
-
- __syncthreads();
-}
-
-// Over what radix we are selecting values
-#define RADIX_BITS 2 // digits are base-(2 ^ RADIX_BITS)
-#define RADIX_SIZE 4 // 2 ^ RADIX_BITS
-#define RADIX_MASK (RADIX_SIZE - 1)
-
-// This finds the unique value `v` that matches the pattern
-// ((v & desired) == desiredMask) in our sorted int format
-template <typename DataType, typename IndexType, typename RadixConverter>
-__device__ float findPattern(const RadixConverter& conv,
- DataType* smem,
- DataType* data,
- IndexType sliceSize,
- IndexType withinSliceStride,
- unsigned int desired,
- unsigned int desiredMask) {
- if (threadIdx.x < 32) {
- smem[threadIdx.x] = (DataType) 0;
- }
- __syncthreads();
-
- // All threads participate in the loop, in order to sync on the flag
- IndexType numIterations = THCRoundUp(sliceSize, (IndexType) blockDim.x);
- for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
- bool inRange = (i < sliceSize);
- DataType v = inRange ? doLdg(&data[i * withinSliceStride]) : (DataType) 0;
-
- if (inRange && ((conv.convert(v) & desiredMask) == desired)) {
- // There should not be conflicts if we are using findPattern,
- // since the result is unique
- smem[0] = (DataType) 1;
- smem[1] = v; // can't use val as the flag, since it could be 0
- }
-
- __syncthreads();
-
- DataType found = smem[0];
- DataType val = smem[1];
-
- __syncthreads();
-
- // Check to see if a thread found the value
- if (found != (DataType) 0) {
- // all threads return this value
- return val;
- }
- }
-
- // should not get here
- assert(false);
- return (DataType) 0;
-}
-
-// Returns the top-Kth element found in the data using radix selection
-template <typename DataType, typename BitDataType, typename IndexType,
- typename RadixConverter, bool Order>
-__device__ void radixSelect(const RadixConverter& conv,
- DataType* data,
- IndexType k,
- IndexType sliceSize,
- IndexType withinSliceStride,
- int* smem,
- DataType* topK) {
- // Per-thread buckets into which we accumulate digit counts in our
- // radix
- int counts[RADIX_SIZE];
-
- // We only consider elements x such that (x & desiredMask) == desired
- // Initially, we consider all elements of the array, so the above
- // statement is true regardless of input.
- unsigned int desired = 0;
- unsigned int desiredMask = 0;
-
- // We are looking for the top kToFind-th element when iterating over
- // digits; this count gets reduced by elimination when counting
- // successive digits
- int kToFind = k;
-
- // We start at the most significant digit in our radix, scanning
- // through to the least significant digit
-#pragma unroll
- for (int digitPos = sizeof(BitDataType) * 8 - RADIX_BITS;
- digitPos >= 0;
- digitPos -= RADIX_BITS) {
-
- // Count radix distribution for the current position and reduce
- // across all threads
- countRadixUsingMask<DataType, BitDataType,
- IndexType, int, RadixConverter,
- RADIX_SIZE, RADIX_BITS>(
- conv, counts, smem,
- desired, desiredMask, digitPos,
- sliceSize, withinSliceStride, data);
-
- // All threads participate in the comparisons below to know the
- // final result
-
-#define CHECK_RADIX(i) \
- int count = counts[i]; \
- \
- /* All threads have the same value in counts here, so all */ \
- /* threads will return from the function. */ \
- if (count == 1 && kToFind == 1) { \
- /* There is a unique answer. */ \
- desired = setBitfield(desired, i, digitPos, RADIX_BITS); \
- desiredMask = \
- setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
- \
- /* The answer is now the unique element v such that: */ \
- /* (v & desiredMask) == desired */ \
- /* However, we do not yet know what the actual element is. We */ \
- /* need to perform a search through the data to find the */ \
- /* element that matches this pattern. */ \
- *topK = findPattern<DataType, IndexType, RadixConverter>( \
- conv, (float*) smem, data, sliceSize, \
- withinSliceStride, desired, desiredMask); \
- return; \
- } \
- \
- if (count >= kToFind) { \
- desired = setBitfield(desired, i, digitPos, RADIX_BITS); \
- desiredMask = \
- setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
- \
- /* The top-Kth element v must now be one such that: */ \
- /* (v & desiredMask == desired) */ \
- /* but we haven't narrowed it down; we must check the next */ \
- /* least-significant digit */ \
- break; \
- } \
- \
- kToFind -= count \
-
- if (Order) {
- // Process in descending order
-#pragma unroll
- for (int i = RADIX_SIZE - 1; i >= 0; --i) {
- CHECK_RADIX(i);
- }
- } else {
- // Process in ascending order
-#pragma unroll
- for (int i = 0; i < RADIX_SIZE; ++i) {
- CHECK_RADIX(i);
- }
- }
-#undef CHECK_RADIX
- } // end digitPos for
-
- // There is no unique result, but there is a non-unique result
- // matching `desired` exactly
- *topK = conv.deconvert(desired);
-}
-
-template <typename IndexType, int Dim, bool Order>
-__global__ void gatherTopK(TensorInfo<float, IndexType> input,
- IndexType inputSliceSize,
- IndexType outputSliceSize, // aka `k`
-
- IndexType numInputSlices,
- IndexType inputWithinSliceStride,
-
- TensorInfo<float, IndexType> topK,
- IndexType numTopKSlices,
- IndexType topKWithinSliceStride,
-
- TensorInfo<long, IndexType> indices,
- IndexType indicesWithinSliceStride) {
- // Indices are limited to integer fp precision, so counts can fit in
- // int32, regardless of IndexType
- __shared__ int smem[32]; // one per each warp, up to warp limit
-
- IndexType slice = getLinearBlockId<IndexType>();
- if (slice >= numInputSlices) {
- return;
- }
-
- // Find the start offset for our slice
- IndexType sliceStartIndex =
- IndexToOffset<float, IndexType, Dim>::get(slice, input);
- IndexType topKSliceStartIndex =
- IndexToOffset<float, IndexType, Dim>::get(slice, topK);
- IndexType indicesSliceStartIndex =
- IndexToOffset<long, IndexType, Dim>::get(slice, indices);
-
- float* inputSliceStart = &input.data[sliceStartIndex];
- float* topKSliceStart = &topK.data[topKSliceStartIndex];
- long* indicesSliceStart = &indices.data[indicesSliceStartIndex];
-
- // Find the k-th highest element in our input
- float topKValue = -1.0f;
- radixSelect<float, unsigned int, IndexType, FloatToSortedInt, Order>(
- FloatToSortedInt(),
- inputSliceStart, outputSliceSize,
- inputSliceSize, inputWithinSliceStride,
- smem, &topKValue);
-
- // Every value that is strictly less/greater than `pattern`
- // (depending on sort dir) in sorted int format is in the top-K.
- // The top-K value itself might not be unique.
- //
- // Since there are a variable number of elements that we see that
- // are within the top-k, we don't know at what index to write out
- // the resulting values.
- // In order to get this, we perform an exclusive prefix sum of
- // `hasTopK`. This will return the resulting index into which we
- // need to write the result, if a thread has a result.
-
- // All threads need to participate in the loop and the prefix sum,
- // but not necessarily in the load; hence loop bounds being rounded
- // up to a multiple of the block dim.
- IndexType numIterations = THCRoundUp(inputSliceSize, (IndexType) blockDim.x);
- IndexType writeIndexStart = 0;
-
- for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
- bool inRange = (i < inputSliceSize);
- float v =
- inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : 0.0f;
- bool hasTopK;
- if (Order) {
- hasTopK = inRange && (v > topKValue);
- } else {
- hasTopK = inRange && (v < topKValue);
- }
-
- int index;
- int carry;
- exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
-
- if (hasTopK) {
- int writeIndex = writeIndexStart + index;
- assert(writeIndex < outputSliceSize);
-
- IndexType topKOffset = writeIndex * topKWithinSliceStride;
- IndexType indexOffset = writeIndex * indicesWithinSliceStride;
-
- topKSliceStart[topKOffset] = v;
- indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
- }
-
- writeIndexStart += carry;
- }
-
- // We need to fill in the rest with actual == top-K values.
- // The number that we need is outputSliceSize -
- // writeIndexStart. There might be more than that number available,
- // in which case we have to choose the first seen set. We do this
- // via a prefix sum to calculate indices for writing results.
- assert(outputSliceSize >= writeIndexStart);
- IndexType topKRemaining = (outputSliceSize - writeIndexStart);
-
- for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
- bool inRange = (i < inputSliceSize);
- float v =
- inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : 0.0f;
- bool hasTopK = inRange && (v == topKValue);
-
- int index;
- int carry;
- exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
-
- if (hasTopK && index < topKRemaining) {
- int writeIndex = writeIndexStart + index;
- assert(writeIndex < outputSliceSize);
-
- IndexType topKOffset = writeIndex * topKWithinSliceStride;
- IndexType indexOffset = writeIndex * indicesWithinSliceStride;
-
- topKSliceStart[topKOffset] = v;
- indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
- }
-
- if (carry >= topKRemaining) {
- break;
- }
-
- topKRemaining -= carry;
- writeIndexStart += carry;
- }
-}
-
-#undef RADIX_BITS
-#undef RADIX_SIZE
-#undef RADIX_MASK
-
-THC_API void THCudaTensor_topk(THCState* state,
- THCudaTensor *topK,
- THCudaLongTensor *indices,
- THCudaTensor *input,
- long k, int dim, int dir, int sorted) {
- THAssert(topK != NULL && indices != NULL && input != NULL);
- THAssert(THCudaTensor_checkGPU(state, 3, topK, indices, input));
- THCCheckTensorDims(state, topK, 2);
- long dims = THCudaLongTensor_nDimension(state, indices);
- THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
- THCCheckTensorDims(state, input, 2);
-
- int numDims = THCudaTensor_nDimension(state, input);
- THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range");
-
- long sliceSize = THCudaTensor_size(state, input, dim);
- THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension");
-
- // Build the output size, which is the dim being selected set to
- // size k
- THLongStorage* topKSize = THCudaTensor_newSizeOf(state, input);
- THLongStorage_set(topKSize, dim, k);
- THCudaTensor_resize(state, topK, topKSize, NULL);
- THCudaLongTensor_resize(state, indices, topKSize, NULL);
- THLongStorage_free(topKSize);
-
-#define RUN_K(INDEX_T, DIM, DIR) \
- gatherTopK<INDEX_T, DIM, DIR> \
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
- inputInfo, \
- sliceSize, \
- k, \
- inputSlices, \
- /* The actual dimension that the k-selection is running in */ \
- /* may have changed from collapseDims() */ \
- inputInfo.strides[collapseInputDim], \
- topKInfo, \
- topKSlices, \
- topKInfo.strides[collapseTopKDim], \
- indicesInfo, \
- indicesInfo.strides[collapseIndicesDim])
-
-#define RUN_DIR(INDEX_T, DIM) \
- if (dir) { \
- RUN_K(INDEX_T, DIM, true); \
- } else { \
- RUN_K(INDEX_T, DIM, false); \
- }
-
-#define RUN_DIM(INDEX_T) \
- if (allDims == 1) { \
- RUN_DIR(INDEX_T, 1); \
- } else if (allDims == 2) { \
- RUN_DIR(INDEX_T, 2); \
- } else if (allDims == 3) { \
- RUN_DIR(INDEX_T, 3); \
- } else { \
- RUN_DIR(INDEX_T, -1); \
- }
-
-#define RUN_T(INDEX_T) \
- TensorInfo<float, INDEX_T> inputInfo = \
- getTensorInfo<THCudaTensor, INDEX_T>(state, input); \
- TensorInfo<float, INDEX_T> topKInfo = \
- getTensorInfo<THCudaTensor, INDEX_T>(state, topK); \
- TensorInfo<long, INDEX_T> indicesInfo = \
- getTensorInfo<THCudaLongTensor, INDEX_T>(state, indices); \
- \
- /* We use these structures solely to find the offset to */ \
- /* each slice we are operating on */ \
- inputInfo.sizes[dim] = 1; \
- topKInfo.sizes[dim] = 1; \
- indicesInfo.sizes[dim] = 1; \
- \
- /* Collapse all other dims */ \
- int collapseInputDim = inputInfo.collapseDims(dim); \
- int collapseTopKDim = topKInfo.collapseDims(dim); \
- int collapseIndicesDim = indicesInfo.collapseDims(dim); \
- \
- long inputSlices = 1; \
- long topKSlices = 1; \
- for (int i = 0; i < numDims; ++i) { \
- inputSlices *= inputInfo.sizes[i]; \
- topKSlices *= topKInfo.sizes[i]; \
- } \
- \
- dim3 grid; \
- if (!THC_getGridFromTiles(inputSlices, grid)) { \
- THError("Slice to sort is too large"); \
- } \
- \
- dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \
- \
- /* This is used as a template parameter to calculate indices. */ \
- /* We only specialize it if all collapsed dim sizes are the */ \
- /* same; otherwise, we use -1 which is the specialization */ \
- /* parameter for arbitrary dimensions */ \
- int allDims = inputInfo.dims; \
- if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \
- allDims = -1; \
- } \
- \
- RUN_DIM(INDEX_T);
-
- // Based on required index size, run the algorithm with the
- // appropriate index type
- if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input) &&
- TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, topK) &&
- TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
- RUN_T(unsigned int);
- } else {
- RUN_T(unsigned long);
- }
-#undef RUN_T
-#undef RUN_DIM
-#undef RUN_DIR
-#undef RUN_K
-
- // Sort the results if the user wants them sorted, since our
- // selection routine does not ensure sorting
- if (sorted) {
- // FIXME: the k/v inplace sort along slice only works for size <=
- // 2048 at the moment
- if (sliceSize <= 2048) {
- // This avoids any memory allocations and performs all sorting
- // work inplace along the slice
- THCudaTensor_sortKeyValueInplace(state, topK, indices, dim, dir);
- } else {
- // Depend upon the backup sort that returns indices, which we
- // can use in conjunction with gather to produce the original
- // indices.
- // This is not the most efficient implementation, especially since
- // there are memory allocations performed here. If the user desires
- // greater performance, they should torch.gather() the results
- // themselves using the reported indices, providing previously
- // allocated tensors to receive the results.
- THCudaTensor* sortedTopK = THCudaTensor_new(state);
- THCudaLongTensor* sortedIndices = THCudaLongTensor_new(state);
- THCudaTensor_sort(state, sortedTopK, sortedIndices, topK, dim, dir);
-
- THCudaLongTensor* sortedTopKIndices = THCudaLongTensor_new(state);
-
- THCudaLongTensor_resizeAs(state, sortedTopKIndices, indices);
- THCudaLongTensor_gather(state, sortedTopKIndices, indices, dim, sortedIndices);
-
- THCudaTensor_freeCopyTo(state, sortedTopK, topK);
- THCudaLongTensor_freeCopyTo(state, sortedTopKIndices, indices);
- THCudaLongTensor_free(state, sortedIndices);
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
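The gatherTopK kernel (moved into THCTensorTopK.cuh below) assigns write slots with an exclusive prefix sum over the per-thread hasTopK flags: a thread's scan value is its slot within the pass, and the pass total (the "carry") advances the running write offset. A sequential C++ sketch of that bookkeeping, with illustrative names and blockDim as an ordinary parameter:

    #include <cstdio>
    #include <vector>

    // Sequential model of gatherTopK's write indexing: each pass covers
    // `blockDim` elements, an exclusive scan over the hasTopK flags gives
    // each surviving element its slot, and the pass total advances
    // writeIndexStart for the next pass.
    void compactGreaterThan(const std::vector<float>& in, float threshold,
                            int blockDim, std::vector<float>& out) {
        int writeIndexStart = 0;
        for (size_t base = 0; base < in.size(); base += blockDim) {
            int running = 0;                             // exclusive scan accumulator
            for (int t = 0; t < blockDim && base + t < in.size(); ++t) {
                bool hasTopK = in[base + t] > threshold;
                int index = running;                     // this element's slot in the pass
                if (hasTopK) {
                    out[writeIndexStart + index] = in[base + t];
                    ++running;
                }
            }
            writeIndexStart += running;                  // "carry" from this pass
        }
    }

    int main() {
        std::vector<float> in = {0.2f, 5.0f, 3.5f, 0.1f, 9.0f, 7.0f};
        std::vector<float> out(4);
        compactGreaterThan(in, 1.0f, /*blockDim=*/4, out);   // keeps 5, 3.5, 9, 7 in order
        for (float v : out) std::printf("%g ", v);
        std::printf("\n");
    }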
diff --git a/lib/THC/THCTensorTopK.cuh b/lib/THC/THCTensorTopK.cuh
new file mode 100644
index 0000000..7269e99
--- /dev/null
+++ b/lib/THC/THCTensorTopK.cuh
@@ -0,0 +1,485 @@
+#ifndef THC_TENSOR_TOPK_CUH
+#define THC_TENSOR_TOPK_CUH
+
+template <typename T>
+struct TopKTypeConfig {};
+
+template <>
+struct TopKTypeConfig<float> {
+ typedef unsigned int RadixType;
+
+ // Converts a float to an integer representation with the same
+ // sorting; i.e., for floats f1, f2:
+ // if f1 < f2 then convert(f1) < convert(f2)
+ // We use this to enable radix selection of floating-point values.
+ // This also gives a relative order for NaNs, but that's ok, as they
+ // will all be adjacent
+ static inline __device__ RadixType convert(float v) {
+ RadixType x = __float_as_int(v);
+ RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
+
+ return (x ^ mask);
+ }
+
+ static inline __device__ float deconvert(RadixType v) {
+ RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
+
+ return __int_as_float(v ^ mask);
+ }
+};
+
+template <>
+struct TopKTypeConfig<unsigned char> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(unsigned char v) {
+ return v;
+ }
+
+ static inline __device__ unsigned char deconvert(RadixType v) {
+ return v;
+ }
+};
+
+template <>
+struct TopKTypeConfig<char> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(char v) {
+ return 128u + v;
+ }
+
+ static inline __device__ char deconvert(RadixType v) {
+ return v - 128;
+ }
+};
+
+template <>
+struct TopKTypeConfig<short> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(short v) {
+ assert(sizeof(short) == 2);
+ return 32768u + v;
+ }
+
+ static inline __device__ short deconvert(RadixType v) {
+ return v - 32768;
+ }
+};
+
+template <>
+struct TopKTypeConfig<int> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(int v) {
+ assert(sizeof(int) == 4);
+ return 2147483648u + v;
+ }
+
+ static inline __device__ int deconvert(RadixType v) {
+ return v - 2147483648u;
+ }
+};
+
+template <>
+struct TopKTypeConfig<long> {
+ typedef unsigned long long int RadixType;
+
+ static inline __device__ RadixType convert(long v) {
+ assert(sizeof(long) == 8);
+ return 9223372036854775808ull + v;
+ }
+
+ static inline __device__ long deconvert(RadixType v) {
+ return v - 9223372036854775808ull;
+ }
+};
+
+template <>
+struct TopKTypeConfig<double> {
+ typedef unsigned long long int RadixType;
+
+ static inline __device__ RadixType convert(double v) {
+ RadixType x = __double_as_longlong(v);
+ RadixType mask = -((x >> 63)) | 0x8000000000000000;
+ return (x ^ mask);
+ }
+
+ static inline __device__ double deconvert(RadixType v) {
+ RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
+ return __longlong_as_double(v ^ mask);
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TopKTypeConfig<half> {
+ typedef unsigned int RadixType;
+
+ static inline __device__ RadixType convert(half v) {
+#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
+ RadixType x = __half_as_ushort(v);
+ RadixType mask = -((x >> 15)) | 0x8000;
+ return (x ^ mask);
+#else
+ assert(false);
+ return 0u;
+#endif
+ }
+
+ static inline __device__ half deconvert(RadixType v) {
+#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
+ RadixType mask = ((v >> 15) - 1) | 0x8000;
+ return __ushort_as_half(v ^ mask);
+#else
+ assert(false);
+ return ScalarConvert<int, half>::to(0);
+#endif
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+// This function counts the distribution of all input values in a
+// slice we are selecting by radix digit at `radixDigitPos`, but only
+// those that pass the filter `((v & desiredMask) == desired)`.
+// This produces and broadcasts the seen counts for a single block only.
+// `smem` must have at least `RadixSize` elements.
+template <typename DataType, typename BitDataType,
+ typename IndexType, typename CountType,
+ int RadixSize, int RadixBits>
+__device__ void countRadixUsingMask(CountType counts[RadixSize],
+ CountType* smem,
+ BitDataType desired,
+ BitDataType desiredMask,
+ int radixDigitPos,
+ IndexType sliceSize,
+ IndexType withinSliceStride,
+ DataType* data) {
+ // Clear out per-thread counts from a previous round
+#pragma unroll
+ for (int i = 0; i < RadixSize; ++i) {
+ counts[i] = 0;
+ }
+
+ if (threadIdx.x < RadixSize) {
+ smem[threadIdx.x] = 0;
+ }
+ __syncthreads();
+
+ // Scan over all the data. Upon a read, the warp will accumulate
+ // counts per each digit in the radix using warp voting.
+ for (IndexType i = threadIdx.x; i < sliceSize; i += blockDim.x) {
+ BitDataType val = TopKTypeConfig<DataType>::convert(doLdg(&data[i * withinSliceStride]));
+
+ bool hasVal = ((val & desiredMask) == desired);
+ BitDataType digitInRadix = Bitfield<BitDataType>::getBitfield(val, radixDigitPos, RadixBits);
+
+#pragma unroll
+ for (unsigned int j = 0; j < RadixSize; ++j) {
+ bool vote = hasVal && (digitInRadix == j);
+ counts[j] += __popc(__ballot(vote));
+ }
+ }
+
+ // Now, for each warp, sum values
+ if (getLaneId() == 0) {
+#pragma unroll
+ for (unsigned int i = 0; i < RadixSize; ++i) {
+ atomicAdd(&smem[i], counts[i]);
+ }
+ }
+
+ __syncthreads();
+
+ // For each thread, read in the total counts
+#pragma unroll
+ for (unsigned int i = 0; i < RadixSize; ++i) {
+ counts[i] = smem[i];
+ }
+
+ __syncthreads();
+}
+
+// Over what radix we are selecting values
+#define RADIX_BITS 2 // digits are base-(2 ^ RADIX_BITS)
+#define RADIX_SIZE 4 // 2 ^ RADIX_BITS
+#define RADIX_MASK (RADIX_SIZE - 1)
+
+// This finds the unique value `v` that matches the pattern
+// ((v & desiredMask) == desired) in our sorted int format
+template <typename DataType, typename BitDataType, typename IndexType>
+__device__ DataType findPattern(DataType* smem,
+ DataType* data,
+ IndexType sliceSize,
+ IndexType withinSliceStride,
+ BitDataType desired,
+ BitDataType desiredMask) {
+ if (threadIdx.x < 32) {
+ smem[threadIdx.x] = ScalarConvert<int, DataType>::to(0);
+ }
+ __syncthreads();
+
+ // All threads participate in the loop, in order to sync on the flag
+ IndexType numIterations = THCRoundUp(sliceSize, (IndexType) blockDim.x);
+ for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+ bool inRange = (i < sliceSize);
+ DataType v = inRange ? doLdg(&data[i * withinSliceStride]) : ScalarConvert<int, DataType>::to(0);
+
+ if (inRange && ((TopKTypeConfig<DataType>::convert(v) & desiredMask) == desired)) {
+ // There should not be conflicts if we are using findPattern,
+ // since the result is unique
+ smem[0] = ScalarConvert<int, DataType>::to(1);
+ smem[1] = v; // can't use val as the flag, since it could be 0
+ }
+
+ __syncthreads();
+
+ DataType found = smem[0];
+ DataType val = smem[1];
+
+ __syncthreads();
+
+ // Check to see if a thread found the value
+ if (THCNumerics<DataType>::ne(found, ScalarConvert<int, DataType>::to(0))) {
+ // all threads return this value
+ return val;
+ }
+ }
+
+ // should not get here
+ assert(false);
+ return ScalarConvert<int, DataType>::to(0);
+}
+
+// Returns the top-Kth element found in the data using radix selection
+template <typename DataType, typename BitDataType, typename IndexType, bool Order>
+__device__ void radixSelect(DataType* data,
+ IndexType k,
+ IndexType sliceSize,
+ IndexType withinSliceStride,
+ int* smem,
+ DataType* topK) {
+ // Per-thread buckets into which we accumulate digit counts in our
+ // radix
+ int counts[RADIX_SIZE];
+
+ // We only consider elements x such that (x & desiredMask) == desired
+ // Initially, we consider all elements of the array, so the above
+ // statement is true regardless of input.
+ BitDataType desired = 0;
+ BitDataType desiredMask = 0;
+
+ // We are looking for the top kToFind-th element when iterating over
+ // digits; this count gets reduced by elimination when counting
+ // successive digits
+ int kToFind = k;
+
+ // We start at the most significant digit in our radix, scanning
+ // through to the least significant digit
+#pragma unroll
+ for (int digitPos = sizeof(DataType) * 8 - RADIX_BITS;
+ digitPos >= 0;
+ digitPos -= RADIX_BITS) {
+
+ // Count radix distribution for the current position and reduce
+ // across all threads
+ countRadixUsingMask<DataType, BitDataType,
+ IndexType, int,
+ RADIX_SIZE, RADIX_BITS>(
+ counts, smem,
+ desired, desiredMask, digitPos,
+ sliceSize, withinSliceStride, data);
+
+ // All threads participate in the comparisons below to know the
+ // final result
+
+
+#define CHECK_RADIX(i) \
+ int count = counts[i]; \
+ \
+ /* All threads have the same value in counts here, so all */ \
+ /* threads will return from the function. */ \
+ if (count == 1 && kToFind == 1) { \
+ /* There is a unique answer. */ \
+ desired = Bitfield<BitDataType>::setBitfield(desired, i, digitPos, RADIX_BITS); \
+ desiredMask = \
+ Bitfield<BitDataType>::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
+ \
+ /* The answer is now the unique element v such that: */ \
+ /* (v & desiredMask) == desired */ \
+ /* However, we do not yet know what the actual element is. We */ \
+ /* need to perform a search through the data to find the */ \
+ /* element that matches this pattern. */ \
+ *topK = findPattern<DataType, BitDataType, IndexType>( \
+ (DataType*) smem, data, sliceSize, \
+ withinSliceStride, desired, desiredMask); \
+ return; \
+ } \
+ \
+ if (count >= kToFind) { \
+ desired = Bitfield<BitDataType>::setBitfield(desired, i, digitPos, RADIX_BITS); \
+ desiredMask = \
+ Bitfield<BitDataType>::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \
+ \
+ /* The top-Kth element v must now be one such that: */ \
+ /* (v & desiredMask == desired) */ \
+ /* but we haven't narrowed it down; we must check the next */ \
+ /* least-significant digit */ \
+ break; \
+ } \
+ \
+ kToFind -= count \
+
+ if (Order) {
+ // Process in descending order
+#pragma unroll
+ for (int i = RADIX_SIZE - 1; i >= 0; --i) {
+ CHECK_RADIX(i);
+ }
+ } else {
+ // Process in ascending order
+#pragma unroll
+ for (int i = 0; i < RADIX_SIZE; ++i) {
+ CHECK_RADIX(i);
+ }
+ }
+#undef CHECK_RADIX
+ } // end digitPos for
+
+ // There is no unique result, but there is a non-unique result
+ // matching `desired` exactly
+ *topK = TopKTypeConfig<DataType>::deconvert(desired);
+}
+
+template <typename T, typename IndexType, int Dim, bool Order>
+__global__ void gatherTopK(TensorInfo<T, IndexType> input,
+ IndexType inputSliceSize,
+ IndexType outputSliceSize, // aka `k`
+
+ IndexType numInputSlices,
+ IndexType inputWithinSliceStride,
+
+ TensorInfo<T, IndexType> topK,
+ IndexType numTopKSlices,
+ IndexType topKWithinSliceStride,
+
+ TensorInfo<long, IndexType> indices,
+ IndexType indicesWithinSliceStride) {
+ // Indices are limited to integer fp precision, so counts can fit in
+ // int32, regardless of IndexType
+ __shared__ int smem[32]; // one per each warp, up to warp limit
+
+ IndexType slice = getLinearBlockId<IndexType>();
+ if (slice >= numInputSlices) {
+ return;
+ }
+
+ // Find the start offset for our slice
+ IndexType sliceStartIndex =
+ IndexToOffset<T, IndexType, Dim>::get(slice, input);
+ IndexType topKSliceStartIndex =
+ IndexToOffset<T, IndexType, Dim>::get(slice, topK);
+ IndexType indicesSliceStartIndex =
+ IndexToOffset<long, IndexType, Dim>::get(slice, indices);
+
+ T* inputSliceStart = &input.data[sliceStartIndex];
+ T* topKSliceStart = &topK.data[topKSliceStartIndex];
+ long* indicesSliceStart = &indices.data[indicesSliceStartIndex];
+
+ // Find the k-th highest element in our input
+ T topKValue = ScalarConvert<int, T>::to(0);
+ radixSelect<T, typename TopKTypeConfig<T>::RadixType, IndexType, Order>(
+ inputSliceStart, outputSliceSize,
+ inputSliceSize, inputWithinSliceStride,
+ smem, &topKValue);
+
+ // Every value that is strictly less/greater than `pattern`
+ // (depending on sort dir) in sorted int format is in the top-K.
+ // The top-K value itself might not be unique.
+ //
+ // Since there are a variable number of elements that we see that
+ // are within the top-k, we don't know at what index to write out
+ // the resulting values.
+ // In order to get this, we perform an exclusive prefix sum of
+ // `hasTopK`. This will return the resulting index into which we
+ // need to write the result, if a thread has a result.
+
+ // All threads need to participate in the loop and the prefix sum,
+ // but not necessarily in the load; hence loop bounds being rounded
+ // up to a multiple of the block dim.
+ IndexType numIterations = THCRoundUp(inputSliceSize, (IndexType) blockDim.x);
+ IndexType writeIndexStart = 0;
+
+ for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+ bool inRange = (i < inputSliceSize);
+ T v =
+ inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
+ bool hasTopK;
+ if (Order) {
+ hasTopK = inRange && (THCNumerics<T>::gt(v, topKValue));
+ } else {
+ hasTopK = inRange && (THCNumerics<T>::lt(v, topKValue));
+ }
+
+ int index;
+ int carry;
+ exclusiveBinaryPrefixScan<int, true>(smem, hasTopK, &index, &carry, AddOp<int>());
+
+ if (hasTopK) {
+ int writeIndex = writeIndexStart + index;
+ assert(writeIndex < outputSliceSize);
+
+ IndexType topKOffset = writeIndex * topKWithinSliceStride;
+ IndexType indexOffset = writeIndex * indicesWithinSliceStride;
+
+ topKSliceStart[topKOffset] = v;
+ indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
+ }
+
+ writeIndexStart += carry;
+ }
+
+ // We need to fill in the rest with actual == top-K values.
+ // The number that we need is outputSliceSize -
+ // writeIndexStart. There might be more than that number available,
+ // in which case we have to choose the first seen set. We do this
+ // via a prefix sum to calculate indices for writing results.
+ assert(outputSliceSize >= writeIndexStart);
+ IndexType topKRemaining = (outputSliceSize - writeIndexStart);
+
+ for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+ bool inRange = (i < inputSliceSize);
+ T v =
+ inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
+ bool hasTopK = inRange && (THCNumerics<T>::eq(v, topKValue));
+
+ int index;
+ int carry;
+ exclusiveBinaryPrefixScan<int, true>(smem, hasTopK, &index, &carry, AddOp<int>());
+
+ if (hasTopK && index < topKRemaining) {
+ int writeIndex = writeIndexStart + index;
+ assert(writeIndex < outputSliceSize);
+
+ IndexType topKOffset = writeIndex * topKWithinSliceStride;
+ IndexType indexOffset = writeIndex * indicesWithinSliceStride;
+
+ topKSliceStart[topKOffset] = v;
+ indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
+ }
+
+ if (carry >= topKRemaining) {
+ break;
+ }
+
+ topKRemaining -= carry;
+ writeIndexStart += carry;
+ }
+}
+
+#undef RADIX_BITS
+#undef RADIX_SIZE
+#undef RADIX_MASK
+
+#endif // THC_TENSOR_TOPK_CUH
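A host-side C++ check of the TopKTypeConfig<float> mapping above, assuming IEEE-754 floats: flipping the sign bit for positives and all bits for negatives makes unsigned comparison agree with float comparison. memcpy stands in for the device-only __float_as_int / __int_as_float intrinsics; nothing here is cutorch API.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint32_t convertFloat(float v) {
        uint32_t x;
        std::memcpy(&x, &v, sizeof(x));                   // host stand-in for __float_as_int
        uint32_t mask = (x & 0x80000000u) ? 0xffffffffu : 0x80000000u;
        return x ^ mask;
    }

    static float deconvertFloat(uint32_t v) {
        uint32_t mask = (v & 0x80000000u) ? 0x80000000u : 0xffffffffu;
        uint32_t x = v ^ mask;
        float f;
        std::memcpy(&f, &x, sizeof(f));                   // host stand-in for __int_as_float
        return f;
    }

    int main() {
        float a = -2.5f, b = 0.5f, c = 3.75f;
        // Order is preserved: a < b < c implies convert(a) < convert(b) < convert(c)
        assert(convertFloat(a) < convertFloat(b) && convertFloat(b) < convertFloat(c));
        // The mapping is invertible
        assert(deconvertFloat(convertFloat(c)) == c);
    }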
diff --git a/lib/THC/THCTensorTopK.h b/lib/THC/THCTensorTopK.h
deleted file mode 100644
index 711c047..0000000
--- a/lib/THC/THCTensorTopK.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef TH_CUDA_TENSOR_TOPK_INC
-#define TH_CUDA_TENSOR_TOPK_INC
-
-#include "THCTensor.h"
-
-/* Returns the set of all kth smallest (or largest) elements, depending */
-/* on `dir` */
-THC_API void THCudaTensor_topk(THCState* state,
- THCudaTensor* topK,
- THCudaLongTensor* indices,
- THCudaTensor* input,
- long k, int dim, int dir, int sorted);
-
-#endif
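For readers following radixSelect in the new THCTensorTopK.cuh, here is the same idea as a single-threaded C++ sketch over already-converted unsigned keys: fix RADIX_BITS bits at a time from the most significant end, keeping only the bucket that still contains the k-th largest element. It omits the kernel's warp voting and findPattern early-out, so treat it as an illustration rather than the device code.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Returns the k-th largest (k >= 1, k <= data.size()) element of `data`,
    // narrowing 2 radix bits per round.
    uint32_t radixSelectLargest(const std::vector<uint32_t>& data, int k) {
        const int RADIX_BITS = 2, RADIX_SIZE = 4;
        uint32_t desired = 0, desiredMask = 0;
        int kToFind = k;

        for (int digitPos = 32 - RADIX_BITS; digitPos >= 0; digitPos -= RADIX_BITS) {
            int counts[RADIX_SIZE] = {0, 0, 0, 0};
            for (uint32_t v : data) {
                if ((v & desiredMask) == desired) {          // still a candidate
                    ++counts[(v >> digitPos) & (RADIX_SIZE - 1)];
                }
            }
            // Largest-first: walk the digit buckets from high to low.
            for (int d = RADIX_SIZE - 1; d >= 0; --d) {
                if (counts[d] >= kToFind) {                  // the answer lives in bucket d
                    desired |= (uint32_t)d << digitPos;
                    desiredMask |= (uint32_t)(RADIX_SIZE - 1) << digitPos;
                    break;
                }
                kToFind -= counts[d];                        // skip this whole bucket
            }
        }
        return desired;                                      // every bit has been fixed
    }

    int main() {
        std::vector<uint32_t> data = {7, 42, 19, 42, 3, 100};
        std::printf("%u\n", radixSelectLargest(data, 2));    // prints 42 (2nd largest)
    }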
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
index bdcbcbe..e4c1c34 100644
--- a/lib/THC/THCTensorTypeUtils.cu
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -73,6 +73,14 @@ TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state, \
TENSOR_TYPE##_resizeAs(state, dst, src); \
} \
\
+void \
+TensorUtils<TENSOR_TYPE>::squeeze1d(THCState *state, \
+ TENSOR_TYPE *dst, \
+ TENSOR_TYPE *src, \
+ int dimension) { \
+ TENSOR_TYPE##_squeeze1d(state, dst, src, dimension); \
+} \
+ \
DATA_TYPE* \
TensorUtils<TENSOR_TYPE>::getData(THCState* state, \
TENSOR_TYPE* t) { \
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 273606e..37edb76 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -49,6 +49,8 @@ struct TensorUtils {
THLongStorage* strides); \
static void resizeAs(THCState* state, TENSOR_TYPE* dst, \
TENSOR_TYPE* src); \
+ static void squeeze1d(THCState *state, TENSOR_TYPE *dst, \
+ TENSOR_TYPE *src, int dimension); \
static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \
static ptrdiff_t getNumElements(THCState* state, TENSOR_TYPE* t); \
static long getSize(THCState* state, TENSOR_TYPE* t, int dim); \
diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c
index 1770535..c227032 100644
--- a/lib/THC/generic/THCTensor.c
+++ b/lib/THC/generic/THCTensor.c
@@ -65,7 +65,6 @@ void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag)
/**** creation methods ****/
static void THCTensor_(rawInit)(THCState *state, THCTensor *self);
-static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
/* Empty init */
@@ -81,13 +80,13 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor)
{
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
- THCTensor_(rawSet)(state,
- self,
- tensor->storage,
- tensor->storageOffset,
- tensor->nDimension,
- tensor->size,
- tensor->stride);
+ THCTensor_(setStorageNd)(state,
+ self,
+ tensor->storage,
+ tensor->storageOffset,
+ tensor->nDimension,
+ tensor->size,
+ tensor->stride);
return self;
}
@@ -99,13 +98,13 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd
THArgCheck(size->size == stride->size, 4, "inconsistent size");
THCTensor_(rawInit)(state, self);
- THCTensor_(rawSet)(state,
- self,
- storage,
- storageOffset,
- (size ? size->size : (stride ? stride->size : 0)),
- (size ? size->data : NULL),
- (stride ? stride->data : NULL));
+ THCTensor_(setStorageNd)(state,
+ self,
+ storage,
+ storageOffset,
+ (size ? size->size : (stride ? stride->size : 0)),
+ (size ? size->data : NULL),
+ (stride ? stride->data : NULL));
return self;
}
@@ -141,7 +140,7 @@ THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, pt
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
- THCTensor_(rawSet)(state, self, storage, storageOffset, 4, size, stride);
+ THCTensor_(setStorageNd)(state, self, storage, storageOffset, 4, size, stride);
return self;
}
@@ -172,7 +171,7 @@ THCTensor *THCTensor_(newWithSize4d)(THCState *state, long size0, long size1, lo
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
- THCTensor_(rawResize)(state, self, 4, size, NULL);
+ THCTensor_(resizeNd)(state, self, 4, size, NULL);
return self;
}
@@ -224,6 +223,17 @@ THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimensi
return self;
}
+THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size)
+{
+ THArgCheck(THCTensor_(isContiguous)(state, tensor), 2, "input is not contiguous");
+ ptrdiff_t numel = THCTensor_(nElement)(state, tensor);
+ THCTensor *self = THCTensor_(new)(state);
+ THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel);
+ THCTensor_(setStorage)(state, self, tensor->storage, tensor->storageOffset, inferred_size, NULL);
+ THLongStorage_free(inferred_size);
+ return self;
+}
+
/* Resize */
void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride)
{
@@ -231,7 +241,7 @@ void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, T
if(stride)
THArgCheck(stride->size == size->size, 3, "invalid stride");
- THCTensor_(rawResize)(state, self, size->size, size->data, (stride ? stride->data : NULL));
+ THCTensor_(resizeNd)(state, self, size->size, size->data, (stride ? stride->data : NULL));
}
void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src)
@@ -252,7 +262,7 @@ void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src)
}
if(!isSame)
- THCTensor_(rawResize)(state, self, src->nDimension, src->size, NULL);
+ THCTensor_(resizeNd)(state, self, src->nDimension, src->size, NULL);
}
void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, long size0)
@@ -274,26 +284,48 @@ void THCTensor_(resize4d)(THCState *state, THCTensor *self, long size0, long siz
{
long size[4] = {size0, size1, size2, size3};
- THCTensor_(rawResize)(state, self, 4, size, NULL);
+ THCTensor_(resizeNd)(state, self, 4, size, NULL);
}
void THCTensor_(resize5d)(THCState *state, THCTensor *self, long size0, long size1, long size2, long size3, long size4)
{
long size[5] = {size0, size1, size2, size3, size4};
- THCTensor_(rawResize)(state, self, 5, size, NULL);
+ THCTensor_(resizeNd)(state, self, 5, size, NULL);
+}
+
+THCTensor* THCTensor_(newExpand)(THCState *state, THCTensor *tensor, THLongStorage *sizes) {
+ THArgCheck(THLongStorage_size(sizes) >= THCTensor_(nDimension)(state, tensor), 1, "the number of sizes provided \
+ must be greater than or equal to the number of dimensions in the tensor");
+ THArgCheck(THCTensor_(nDimension)(state, tensor) > 0, 0, "can't expand an empty tensor");
+
+ long *expandedSizes;
+ long *expandedStrides;
+ THLongStorage_calculateExpandGeometry(tensor->size,
+ tensor->stride,
+ THCTensor_(nDimension)(state, tensor),
+ sizes,
+ &expandedSizes,
+ &expandedStrides);
+
+ THCTensor *result = THCTensor_(new)(state);
+ THCTensor_(setStorageNd)(state, result, THCTensor_(storage)(state, tensor), THCTensor_(storageOffset)(state, tensor), THLongStorage_size(sizes), expandedSizes, expandedStrides);
+ THFree(expandedSizes);
+ THFree(expandedStrides);
+
+ return result;
}
void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src)
{
if(self != src)
- THCTensor_(rawSet)(state,
- self,
- src->storage,
- src->storageOffset,
- src->nDimension,
- src->size,
- src->stride);
+ THCTensor_(setStorageNd)(state,
+ self,
+ src->storage,
+ src->storageOffset,
+ src->nDimension,
+ src->size,
+ src->stride);
}
void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
@@ -301,13 +333,13 @@ void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storag
if(size_ && stride_)
THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes");
- THCTensor_(rawSet)(state,
- self,
- storage_,
- storageOffset_,
- (size_ ? size_->size : (stride_ ? stride_->size : 0)),
- (size_ ? size_->data : NULL),
- (stride_ ? stride_->data : NULL));
+ THCTensor_(setStorageNd)(state,
+ self,
+ storage_,
+ storageOffset_,
+ (size_ ? size_->size : (stride_ ? stride_->size : 0)),
+ (size_ ? size_->data : NULL),
+ (stride_ ? stride_->data : NULL));
}
void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
@@ -353,7 +385,7 @@ void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *stor
long size[4] = {size0_, size1_, size2_, size3_};
long stride[4] = {stride0_, stride1_, stride2_, stride3_};
- THCTensor_(rawSet)(state, self, storage_, storageOffset_, 4, size, stride);
+ THCTensor_(setStorageNd)(state, self, storage_, storageOffset_, 4, size, stride);
}
@@ -517,6 +549,33 @@ void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int
}
}
+void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension)
+{
+ int d;
+
+ if(!src)
+ src = self;
+
+ THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 3, "dimension out of range");
+ THArgCheck(src->nDimension > 0, 3, "cannot unsqueeze empty tensor");
+
+ THCTensor_(set)(state, self, src);
+
+ self->size = (long*)THRealloc(self->size, sizeof(long)*(self->nDimension+1));
+ self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1));
+ self->nDimension++;
+ for (d = self->nDimension-1; d > dimension; d--) {
+ self->size[d] = self->size[d-1];
+ self->stride[d] = self->stride[d-1];
+ }
+ if (dimension+1 < self->nDimension) {
+ self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1];
+ } else {
+ self->stride[dimension] = 1;
+ }
+ self->size[dimension] = 1;
+}
+
int THCTensor_(isContiguous)(THCState *state, const THCTensor *self)
{
long z = 1;
@@ -637,7 +696,7 @@ static void THCTensor_(rawInit)(THCState *state, THCTensor *self)
self->flag = TH_TENSOR_REFCOUNTED;
}
-static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
+void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
{
/* storage */
if(self->storage != storage)
@@ -660,10 +719,10 @@ static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *sto
self->storageOffset = storageOffset;
/* size and stride */
- THCTensor_(rawResize)(state, self, nDimension, size, stride);
+ THCTensor_(resizeNd)(state, self, nDimension, size, stride);
}
-void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride)
+void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride)
{
int d;
int nDimension_;
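THCTensor_(unsqueeze1d) above inserts a size-1 dimension without touching storage: the inserted dimension gets stride size[d+1] * stride[d+1] (or 1 when appended at the end), so every element keeps the same linear offset. A small C++ sketch of that size/stride bookkeeping, with hypothetical names:

    #include <cstdio>
    #include <vector>

    // Insert a size-1 dimension at `dim` into a (sizes, strides) pair the
    // same way unsqueeze1d does; the underlying data layout is unchanged.
    void unsqueeze(std::vector<long>& sizes, std::vector<long>& strides, int dim) {
        long newStride = (dim < (int)sizes.size())
                         ? sizes[dim] * strides[dim]     // stride of the slot pushed right
                         : 1;                            // appending a trailing dimension
        sizes.insert(sizes.begin() + dim, 1);
        strides.insert(strides.begin() + dim, newStride);
    }

    int main() {
        // A contiguous 3x4 tensor: sizes {3, 4}, strides {4, 1}.
        std::vector<long> sizes = {3, 4}, strides = {4, 1};
        unsqueeze(sizes, strides, 1);                    // -> sizes {3, 1, 4}, strides {4, 4, 1}
        for (size_t i = 0; i < sizes.size(); ++i)
            std::printf("size %ld stride %ld\n", sizes[i], strides[i]);
    }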
diff --git a/lib/THC/generic/THCTensor.cu b/lib/THC/generic/THCTensor.cu
index 29561ca..8f13a7d 100644
--- a/lib/THC/generic/THCTensor.cu
+++ b/lib/THC/generic/THCTensor.cu
@@ -4,7 +4,7 @@
cudaTextureObject_t THCTensor_(getTextureObject)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
cudaTextureObject_t texObj;
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h
index 9cd4807..9c5d5be 100644
--- a/lib/THC/generic/THCTensor.h
+++ b/lib/THC/generic/THCTensor.h
@@ -66,6 +66,8 @@ THC_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int
THC_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, long firstIndex_, long size_);
THC_API THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_);
THC_API THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, long size_, long step_);
+THC_API THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size);
+THC_API THCTensor *THCTensor_(newExpand)(THCState *state, THCTensor *tensor, THLongStorage *size);
THC_API void THCTensor_(resize)(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride);
THC_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src);
@@ -74,10 +76,11 @@ THC_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, long size0
THC_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_);
THC_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_);
THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_);
-THC_API void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride);
+THC_API void THCTensor_(resizeNd)(THCState *state, THCTensor *tensor, int nDimension, long *size, long *stride);
THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+THC_API void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
long size0_, long stride0_);
THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
@@ -100,6 +103,7 @@ THC_API void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src
THC_API void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
+THC_API void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self);
THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src);
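newView (declared above, implemented via THLongStorage_newInferSize) accepts a shape with at most one -1 entry and infers that dimension from the element count. A standalone C++ sketch of that inference, assuming the usual -1 convention; the helper name is illustrative and this is not the TH implementation:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Resolve a view shape: at most one dimension may be -1, and it is
    // inferred so that the product of sizes equals `numel`.
    std::vector<long> inferSize(std::vector<long> sizes, long numel) {
        long known = 1;
        int inferred = -1;
        for (size_t i = 0; i < sizes.size(); ++i) {
            if (sizes[i] == -1) {
                assert(inferred == -1 && "only one dimension may be -1");
                inferred = (int)i;
            } else {
                known *= sizes[i];
            }
        }
        if (inferred >= 0) {
            assert(known != 0 && numel % known == 0);
            sizes[inferred] = numel / known;
        } else {
            assert(known == numel);
        }
        return sizes;
    }

    int main() {
        std::vector<long> s = inferSize({2, -1, 4}, 24);     // -> {2, 3, 4}
        std::printf("%ld %ld %ld\n", s[0], s[1], s[2]);
    }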
diff --git a/lib/THC/generic/THCTensorCopy.c b/lib/THC/generic/THCTensorCopy.c
index 874a71e..b5122b8 100644
--- a/lib/THC/generic/THCTensorCopy.c
+++ b/lib/THC/generic/THCTensorCopy.c
@@ -118,12 +118,12 @@ void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor
THCudaCheck(cudaSetDevice(tensorDevice));
}
- cudaStream_t stream = THCState_getCurrentStream(state);
+ THCStream *stream = THCState_getStream(state);
THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self),
THTensor_(data)(src),
THTensor_(nElement)(src) * sizeof(real),
cudaMemcpyHostToDevice,
- stream));
+ stream->stream));
THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
@@ -149,12 +149,12 @@ void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor
THCudaCheck(cudaSetDevice(tensorDevice));
}
- cudaStream_t stream = THCState_getCurrentStream(state);
+ THCStream *stream = THCState_getStream(state);
THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self),
THCTensor_(data)(state, src),
THCTensor_(nElement)(state, src) * sizeof(real),
cudaMemcpyDeviceToHost,
- stream));
+ stream->stream));
THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
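The copyAsync paths above now pass the full THCStream to THCCachingHostAllocator_recordEvent, so the allocator can record an event on the stream the copy runs on and knows when the pinned source buffer may be reused. A bare CUDA runtime sketch of that pattern, using plain cudaStream_t / cudaEvent_t rather than the cutorch wrappers:

    #include <cstdio>
    #include <cuda_runtime.h>

    #define CHECK(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
        std::printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

    int main() {
        const size_t bytes = 1 << 20;
        float *host, *device;
        CHECK(cudaMallocHost((void**)&host, bytes));    // pinned host buffer
        CHECK(cudaMalloc((void**)&device, bytes));

        cudaStream_t stream;
        cudaEvent_t done;
        CHECK(cudaStreamCreate(&stream));
        CHECK(cudaEventCreateWithFlags(&done, cudaEventDisableTiming));

        // Async H2D copy, then record an event on the same stream; the host
        // buffer may only be reused once that event has completed.
        CHECK(cudaMemcpyAsync(device, host, bytes, cudaMemcpyHostToDevice, stream));
        CHECK(cudaEventRecord(done, stream));

        CHECK(cudaEventSynchronize(done));              // or poll cudaEventQuery(done)
        std::printf("copy finished; host buffer reusable\n");

        CHECK(cudaEventDestroy(done));
        CHECK(cudaStreamDestroy(stream));
        CHECK(cudaFree(device));
        CHECK(cudaFreeHost(host));
        return 0;
    }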
diff --git a/lib/THC/generic/THCTensorIndex.cu b/lib/THC/generic/THCTensorIndex.cu
index ce4c790..e388cdd 100644
--- a/lib/THC/generic/THCTensorIndex.cu
+++ b/lib/THC/generic/THCTensorIndex.cu
@@ -4,7 +4,7 @@
void THCTensor_(indexCopy_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
THCudaLongTensor_copyLong(state, indices_, indices);
@@ -16,8 +16,8 @@ void THCTensor_(indexCopy_long)(THCState *state, THCTensor *dst, int dim, THLong
void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
@@ -132,7 +132,7 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT
void THCTensor_(indexAdd_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
THCudaLongTensor_copyLong(state, indices_, indices);
@@ -144,8 +144,8 @@ void THCTensor_(indexAdd_long)(THCState *state, THCTensor *dst, int dim, THLongT
void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
@@ -260,7 +260,7 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe
void THCTensor_(indexFill_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, real val)
{
- THAssert(THCTensor_(checkGPU)(state, 1, dst));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, dst));
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
THCudaLongTensor_copyLong(state, indices_, indices);
@@ -272,8 +272,8 @@ void THCTensor_(indexFill_long)(THCState *state, THCTensor *dst, int dim, THLong
void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, real val)
{
- THAssert(THCTensor_(checkGPU)(state, 1, dst));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, dst));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
dims = THCudaLongTensor_nDimension(state, indices);
@@ -374,7 +374,7 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT
void THCTensor_(indexSelect_long)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THLongTensor *indices)
{
- THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src));
THArgCheck(indices->nDimension == 1, 3, "Index is supposed to be a vector");
THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
@@ -387,7 +387,7 @@ void THCTensor_(indexSelect_long)(THCState *state, THCTensor *dst, THCTensor *sr
void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THCudaLongTensor *indices)
{
- THAssert(THCTensor_(checkGPU)(state, 3, dst, src, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, dst, src, indices));
long dims = THCTensor_(nDimension)(state, dst);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
diff --git a/lib/THC/generic/THCTensorMasked.cu b/lib/THC/generic/THCTensorMasked.cu
index 05d9360..c15edd4 100644
--- a/lib/THC/generic/THCTensorMasked.cu
+++ b/lib/THC/generic/THCTensorMasked.cu
@@ -7,7 +7,7 @@ THC_API void
THCTensor_(maskedFill)(THCState* state,
THCTensor *tensor, THCudaByteTensor *mask, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, mask));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask));
THArgCheck(THCTensor_(nElement)(state, tensor) ==
THCudaByteTensor_nElement(state, mask),
2, "sizes do not match");
@@ -24,7 +24,7 @@ THC_API void
THCTensor_(maskedFillByte)(THCState* state,
THCTensor *tensor, THByteTensor *mask, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor));
THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
THLongStorage_free(maskSizes);
@@ -37,7 +37,7 @@ THC_API void
THCTensor_(maskedCopy)(THCState* state,
THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask);
ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor);
ptrdiff_t srcSize = THCTensor_(nElement)(state, src);
@@ -104,7 +104,7 @@ THCTensor_(maskedCopy)(THCState* state,
THC_API void
THCTensor_(maskedCopyByte)(THCState* state,
THCTensor *tensor, THByteTensor *mask, THCTensor *src) {
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
THLongStorage_free(maskSizes);
@@ -116,7 +116,7 @@ THCTensor_(maskedCopyByte)(THCState* state,
THC_API void
THCTensor_(maskedSelect)(THCState* state,
THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) {
- THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
THArgCheck(THCudaByteTensor_nElement(state, mask) ==
THCTensor_(nElement)(state, src),
2, "sizes do not match");
@@ -181,7 +181,7 @@ THC_API void
THCTensor_(maskedSelectByte)(THCState* state,
THCTensor *tensor, THCTensor *src, THByteTensor *mask)
{
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
THLongStorage_free(maskSizes);
diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu
index 46746f7..0eed5a9 100644
--- a/lib/THC/generic/THCTensorMath.cu
+++ b/lib/THC/generic/THCTensorMath.cu
@@ -5,7 +5,7 @@
THC_API void
THCTensor_(fill)(THCState* state, THCTensor *self_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
if (!THC_pointwiseApply1(
state, self_, TensorFillOp<real>(value))) {
@@ -18,7 +18,7 @@ THCTensor_(fill)(THCState* state, THCTensor *self_, real value)
THC_API void
THCTensor_(zero)(THCState *state, THCTensor *self_)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
if (THCTensor_(isContiguous)(state, self_)) {
THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_),
0,
@@ -38,7 +38,7 @@ THCTensor_(zero)(THCState *state, THCTensor *self_)
THC_API void
THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(zero)(state, r_);
}
@@ -46,7 +46,7 @@ THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size)
THC_API void
THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(fill)(state, r_, ScalarConvert<int, real>::to(1));
}
@@ -54,7 +54,7 @@ THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size)
THC_API void
THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 2, r_, t));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, t));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(copy)(state, r_, t);
}
@@ -87,8 +87,8 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// loop below will overwrite the value
int maxDim = dimension + 1;
- // ldimension is the actual dimension we cat along (minus 1, for 0-based indexing)
- int ldimension = dimension;
+ // cat_dimension is the actual dimension we cat along
+ int cat_dimension = dimension;
for (i = 0; i < numInputs; i++)
{
@@ -100,13 +100,13 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// In the event that the user specified -1 as the concat dimension, then
// we want to pick the maxDim as dimension to cat along (and thus maxDim - 1 as the
// value due to 0-based indexing). If the maxDim is 0 (i.e. we are catting all
- // empty tensors), then we set ldimension to be 0
+ // empty tensors), then we set cat_dimension to be 0
if (dimension + TH_INDEX_BASE == -1) {
- ldimension = maxDim ? (maxDim - 1) : 0;
+ cat_dimension = maxDim ? (maxDim - 1) : 0;
}
THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
- THArgCheck(ldimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
+ THArgCheck(cat_dimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
size = THLongStorage_newWithSize(maxDim);
for(i = 0; i < maxDim; i++)
@@ -115,7 +115,7 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
long dimSize = i < THCTensor_(nDimension)(state, inputs[0])
? THCTensor_(size)(state, inputs[0], i)
: THMin(THCTensor_(nDimension)(state, inputs[0]), 1);
- if (i == ldimension)
+ if (i == cat_dimension)
{
for (j = 1; j < numInputs; j++)
{
@@ -175,23 +175,9 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
real *data = THCTensor_(data)(state, result);
// Kernel Parameter
- CatArrInputTensor<real, unsigned int> stackInputs[CAT_ARRAY_BATCH_SIZE];
- CatArrInputTensor<real, unsigned int> *d_inputs;
-
- // Attempt to re-use stream's scratch space for the input metadata
- bool usedScratch = false;
size_t tensorMetadataSize = sizeof(CatArrInputTensor<real, unsigned int>) * CAT_ARRAY_BATCH_SIZE;
- if (THCState_getCurrentDeviceScratchSpaceSize(state) > tensorMetadataSize) {
- void* space = THCState_getCurrentDeviceScratchSpace(state);
- if (space) {
- d_inputs = (CatArrInputTensor<real, unsigned int> *) space;
- usedScratch = true;
- }
- }
- if (!usedScratch) {
- // Fallback to allocating GPU memory
- THCudaCheck(THCudaMalloc(state, (void**) &d_inputs, tensorMetadataSize));
- }
+ CatArrInputTensor<real, unsigned int> *d_inputs;
+ THCudaCheck(THCudaMalloc(state, (void**) &d_inputs, tensorMetadataSize));
OutputTensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> param;
@@ -201,17 +187,21 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
param.outputStride[i] = THCTensor_(stride)(state, result, i);
}
+ THCStream* stream = THCState_getStream(state);
+
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
- CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock>>>(data, d_inputs, param, ldimension, param.outputStride[dimension]);
+ CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock, 0, stream->stream>>>(data, d_inputs, param, cat_dimension, param.outputStride[cat_dimension]);
// Now we loop
offset = 0;
for (i = 0; i < numInputs; i += CAT_ARRAY_BATCH_SIZE) {
+ // Re-allocate stackInputs every iteration to avoid read-after-write hazard
+ CatArrInputTensor<real, unsigned int>* stackInputs = (CatArrInputTensor<real, unsigned int>*) THCudaHostAlloc(state, tensorMetadataSize);
cohortMax = 0;
for (j = 0; j < CAT_ARRAY_BATCH_SIZE && (i+j) < numInputs; ++j) {
- long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[i+j])
- ? THCTensor_(size)(state, inputs[i+j], ldimension)
+ long dimSize = cat_dimension < THCTensor_(nDimension)(state, inputs[i+j])
+ ? THCTensor_(size)(state, inputs[i+j], cat_dimension)
: 1;
stackInputs[j].input = THCTensor_(data)(state, inputs[i+j]);
@@ -223,7 +213,14 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// update offset
offset += dimSize;
}
- cudaMemcpy(d_inputs, stackInputs, j * sizeof(CatArrInputTensor<real, unsigned int>), cudaMemcpyHostToDevice);
+ THCudaCheck(cudaMemcpyAsync(
+ d_inputs,
+ stackInputs,
+ j * sizeof(CatArrInputTensor<real, unsigned int>),
+ cudaMemcpyHostToDevice,
+ stream->stream));
+ THCudaHostRecord(state, stackInputs);
+ THCudaHostFree(state, stackInputs);
// Next, let's consider how we set our kernel launch parameters.
// We borrow from THCApply, which the kernel's internal indexing
@@ -256,9 +253,7 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
}
THCudaCheck(cudaGetLastError());
}
- if (!usedScratch) {
- THCudaCheck(THCudaFree(state, (void *)d_inputs));
- }
+ THCudaCheck(THCudaFree(state, d_inputs));
#undef HANDLE_CASE
} else {
offset = 0;
@@ -267,12 +262,12 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
// No reason to copy when input is empty
if (!THCTensor_(nDimension)(state, inputs[j])) continue;
- long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[j])
- ? THCTensor_(size)(state, inputs[j], ldimension)
+ long dimSize = cat_dimension < THCTensor_(nDimension)(state, inputs[j])
+ ? THCTensor_(size)(state, inputs[j], cat_dimension)
: 1;
THCTensor *nt = THCTensor_(newWithTensor)(state, result);
- THCTensor_(narrow)(state, nt, NULL, ldimension, offset, dimSize);
+ THCTensor_(narrow)(state, nt, NULL, cat_dimension, offset, dimSize);
THCTensor_(copy)(state, nt, inputs[j]);
THCTensor_(free)(state, nt);
offset += dimSize;
@@ -283,8 +278,8 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor,
THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self ));
- THAssert(THCudaLongTensor_checkGPU(state, 1, tensor));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self ));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, tensor));
using namespace thrust::placeholders;
@@ -348,7 +343,7 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor,
}
void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, long k){
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
int nDimension = THCTensor_(nDimension)(state, src_);
THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector");
if (nDimension == 2) {
@@ -382,7 +377,7 @@ void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, long k
}
accreal THCTensor_(trace)(THCState *state, THCTensor *src_) {
- THAssert(THCTensor_(checkGPU)(state, 1, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, src_));
THArgCheck((src_->nDimension == 2), 1, "expected a matrix");
THCTensor *diag = THCTensor_(new)(state);
THCTensor_(diag)(state, diag, src_, 0);
@@ -391,4 +386,67 @@ accreal THCTensor_(trace)(THCState *state, THCTensor *src_) {
return trace;
}
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, long n) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
+ THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points");
+ if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n);
+ if (n == 1) THCTensor_(fill)(state, r_, a);
+ else {
+ THCTensor *r = THCTensor_(isContiguous)(state, r_)
+ ? r_ // if r_ is contiguous we can work on it directly
+ : THCTensor_(newContiguous)(state, r_);
+ real step = THCNumerics<real>::div(THCNumerics<real>::sub(b, a),
+ ScalarConvert<long,real>::to(n - 1));
+ LinspaceOp<real> linspace_method(a, step);
+ thrust::device_ptr<real> data_(THCTensor_(data)(state, r));
+ thrust::tabulate(data_, data_ + n, linspace_method);
+ if (!THCTensor_(isContiguous)(state, r_)) { // We need to move data back to r_
+ THCTensor_(freeCopyTo)(state, r, r_);
+ }
+ }
+ THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, long n) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
+ THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points");
+ if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n);
+ if (n == 1) THCTensor_(fill)(state, r_, THCNumerics<real>::exp10(a));
+ else {
+ THCTensor *r = THCTensor_(isContiguous)(state, r_)
+ ? r_
+ : THCTensor_(newContiguous)(state, r_);
+ real step = THCNumerics<real>::div(THCNumerics<real>::sub(b, a),
+ ScalarConvert<long,real>::to(n - 1));
+ LogspaceOp<real> logspace_method(a, step);
+ thrust::device_ptr<real> data_(THCTensor_(data)(state, r));
+ thrust::tabulate(data_, data_ + n, logspace_method);
+ if (!THCTensor_(isContiguous)(state, r_)) {
+ THCTensor_(freeCopyTo)(state, r, r_);
+ }
+ }
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
+
+void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
+ THArgCheck(step > 0 || step < 0, 3, "step must be nonzero");
+ THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin))
+ , 2, "upper and lower bounds inconsistent with step sign");
+ ptrdiff_t size = (ptrdiff_t) (((xmax - xmin) / step) + 1);
+ if (THCTensor_(nElement)(state, r_) != size) THCTensor_(resize1d)(state, r_, size);
+ THCTensor *r = THCTensor_(isContiguous)(state, r_)
+ ? r_
+ : THCTensor_(newContiguous)(state, r_);
+ LinspaceOp<real,accreal> linspace_method(xmin, step);
+ thrust::device_ptr<real> data_(THCTensor_(data)(state, r));
+ thrust::tabulate(data_, data_ + size, linspace_method);
+ if (!THCTensor_(isContiguous)(state, r_)) THCTensor_(freeCopyTo)(state, r, r_);
+ THCudaCheck(cudaGetLastError());
+}
+
#endif
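
The rewritten catArray path above stages the per-batch input metadata through pinned host memory, issues the host-to-device copy asynchronously on the current THC stream, and hands the pinned buffer back to the caching host allocator before launching the batched copy kernel. Below is a minimal standalone sketch of that staging pattern using the plain CUDA runtime instead of THC's allocators; the struct, kernel, and buffer names are illustrative, not taken from the patch.

#include <cuda_runtime.h>
#include <cstdio>

// Stand-in for CatArrInputTensor: just enough metadata for a flat copy.
struct InputMeta {
  const float* input;
  unsigned int offset;      // where this input starts in the output
  unsigned int nElements;   // number of elements to copy
};

__global__ void copyFromMeta(float* out, const InputMeta* metas, int n) {
  // One block per input tensor; a real cat kernel would also apply the
  // output stride along the concat dimension.
  int t = blockIdx.x;
  if (t >= n) return;
  for (unsigned int i = threadIdx.x; i < metas[t].nElements; i += blockDim.x)
    out[metas[t].offset + i] = metas[t].input[i];
}

int main() {
  const int kBatch = 2, kElems = 4;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float *d_in0, *d_in1, *d_out;
  cudaMalloc((void**)&d_in0, kElems * sizeof(float));
  cudaMalloc((void**)&d_in1, kElems * sizeof(float));
  cudaMalloc((void**)&d_out, 2 * kElems * sizeof(float));
  cudaMemset(d_in0, 0, kElems * sizeof(float));
  cudaMemset(d_in1, 0, kElems * sizeof(float));

  // Device-side metadata buffer (what THCudaMalloc provides above).
  InputMeta* d_meta;
  cudaMalloc((void**)&d_meta, kBatch * sizeof(InputMeta));

  // Pinned host staging buffer (what THCudaHostAlloc provides above);
  // pinned memory is what makes cudaMemcpyAsync actually asynchronous.
  InputMeta* h_meta;
  cudaMallocHost((void**)&h_meta, kBatch * sizeof(InputMeta));
  h_meta[0].input = d_in0; h_meta[0].offset = 0;      h_meta[0].nElements = kElems;
  h_meta[1].input = d_in1; h_meta[1].offset = kElems; h_meta[1].nElements = kElems;

  cudaMemcpyAsync(d_meta, h_meta, kBatch * sizeof(InputMeta),
                  cudaMemcpyHostToDevice, stream);
  copyFromMeta<<<kBatch, 128, 0, stream>>>(d_out, d_meta, kBatch);

  // THC's caching host allocator records the copy and defers reuse of
  // h_meta until it finishes; with the plain runtime we just synchronize.
  cudaStreamSynchronize(stream);
  printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaFreeHost(h_meta);
  cudaFree(d_meta); cudaFree(d_out); cudaFree(d_in1); cudaFree(d_in0);
  cudaStreamDestroy(stream);
  return 0;
}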
diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h
index 2b8f563..aae6775 100644
--- a/lib/THC/generic/THCTensorMath.h
+++ b/lib/THC/generic/THCTensorMath.h
@@ -18,5 +18,13 @@ THC_API void THCTensor_(triu)(THCState *state, THCTensor *self, THCTensor *src,
THC_API void THCTensor_(diag)(THCState *state, THCTensor *self, THCTensor *src, long k);
THC_API accreal THCTensor_(trace)(THCState *state, THCTensor *self);
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, long n);
+THC_API void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, long n);
+
+#endif
+
+THC_API void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step);
#endif
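
The linspace/logspace/range declarations added here are backed in THCTensorMath.cu by one pattern: compute a scalar step, then fill a contiguous buffer with an index-to-value functor via thrust::tabulate. A minimal Thrust sketch of that pattern for float, using the same step formula as linspace; the functor below is a stand-in, not the LinspaceOp defined by the library.

#include <thrust/device_vector.h>
#include <thrust/tabulate.h>
#include <cstdio>

// Maps the flat index i to a + i*step, evaluated on the device.
struct LinspaceLike {
  float start, step;
  LinspaceLike(float a, float s) : start(a), step(s) {}
  __host__ __device__ float operator()(long i) const {
    return start + step * static_cast<float>(i);
  }
};

int main() {
  const long n = 5;
  const float a = 0.f, b = 1.f;
  // Same step formula as THCTensor_(linspace): (b - a) / (n - 1).
  const float step = (b - a) / static_cast<float>(n - 1);

  thrust::device_vector<float> r(n);
  thrust::tabulate(r.begin(), r.end(), LinspaceLike(a, step));

  for (long i = 0; i < n; ++i)
    printf("%g ", static_cast<float>(r[i]));   // 0 0.25 0.5 0.75 1
  printf("\n");
  return 0;
}

range reuses the same scheme with the user-supplied step, and logspace swaps LinspaceOp for LogspaceOp.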
diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu
index f8d85cf..0d47750 100644
--- a/lib/THC/generic/THCTensorMathBlas.cu
+++ b/lib/THC/generic/THCTensorMathBlas.cu
@@ -6,7 +6,7 @@ THC_API accreal
THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src)
{
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
THArgCheck(THCTensor_(nElement)(state, self) ==
THCTensor_(nElement)(state, src), 2, "sizes do not match");
@@ -44,7 +44,7 @@ THC_API void
THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec)
{
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
- THAssert(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec));
if( (mat->nDimension != 2) || (vec->nDimension != 1) )
THError("matrix and vector expected");
@@ -135,7 +135,7 @@ THC_API void
THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2)
{
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
- THAssert(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2));
if ( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) {
THError("vector and vector expected");
}
@@ -154,7 +154,9 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a
THCTensor_(copy)(state, r_, t);
}
- if(THCNumerics<real>::ne(beta, ScalarConvert<int, real>::to(1))) {
+ if(THCNumerics<real>::eq(beta, ScalarConvert<int, real>::to(0))) {
+ THCTensor_(zero)(state, r_);
+ } else if(THCNumerics<real>::ne(beta, ScalarConvert<int, real>::to(1))) {
THCTensor_(mul)(state, r_, r_, beta);
}
@@ -227,7 +229,7 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real
{
#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
- THAssert(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2));
char transpose_r, transpose_m1, transpose_m2;
THCTensor *r__, *m1_, *m2_;
@@ -378,7 +380,7 @@ THC_API void
THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
real alpha, THCTensor *batch1, THCTensor *batch2) {
#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
- THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
THArgCheck(THCTensor_(nDimension)(state, t) == 2, 4, "expected 2D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
@@ -430,8 +432,8 @@ __global__ void createBatchGemmBuffer(const real** buffer, real* data,
THC_API void
THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
real alpha, THCTensor *batch1, THCTensor *batch2) {
-#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
- THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
@@ -522,8 +524,10 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
ldb = batch2_->stride[1];
}
- // Compute pointers to matrices in each batch.
long num_batches = result_->size[0];
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ // Compute pointers to matrices in each batch.
size_t matrices_size = num_batches * sizeof(real*);
// Copy pointers to device.
@@ -580,6 +584,24 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
THCudaFree(state, d_matrices2);
THCudaFree(state, d_result_matrices);
+#elif defined(THC_REAL_IS_HALF)
+ // Currently no HgemmBatched in Cublas
+ for (long i = 0; i < num_batches; ++i) {
+ THCudaBlas_Hgemm(
+ state,
+ transpose_batch1,
+ transpose_batch2,
+ result_->size[transpose_result ? 2 : 1],
+ result_->size[transpose_result ? 1 : 2],
+ batch1_->size[transpose_result ? 1 : 2],
+ alpha,
+ THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda,
+ THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb,
+ beta,
+ THCTensor_(data)(state, result_) + i * result_->stride[0], ldc);
+ }
+#endif
+
if (batch1_ != batch1) {
THCTensor_(free)(state, batch1_);
}
@@ -597,4 +619,208 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
#endif
}
+THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, THCTensor *a)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 2, ra_, a));
+ THArgCheck(THCTensor_(nDimension)(state, a) == 3, 3, "expected 3D tensor");
+ THArgCheck(THCTensor_(size)(state, a, 1) ==
+ THCTensor_(size)(state, a, 2), 3, "matrices must be square");
+
+ if (ra_ != a) {
+ THCTensor_(resizeAs)(state, ra_, a);
+ // not sure if this is kosher, but things are nicer if we return in column major
+ if (ra_->stride[0] == 1) {
+ THCTensor_(transpose)(state, ra_, NULL, 1, 0);
+ } else if (ra_->stride[2] == 1) {
+ THCTensor_(transpose)(state, ra_, NULL, 1, 2);
+ }
+ THCTensor_(copy)(state, ra_, a);
+ }
+
+
+ int n = a->size[1];
+ int lda;
+ THCTensor *ra__;
+
+ if (ra_->stride[1] == 1) {
+ // column ordered, what BLAS wants
+ lda = ra_->stride[2];
+ ra__ = ra_;
+ } else {
+ // not column ordered, need to make it such (requires copy)
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, ra_, 1, 2);
+ ra__ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, ra__, NULL, 1, 2);
+ lda = ra__->stride[2];
+ }
+
+ long num_batches = ra__->size[0];
+
+ THCudaIntTensor_resize2d(state, rpivots_, num_batches, n);
+ int *pivots_gpu = THCudaIntTensor_data(state, rpivots_);
+
+ bool free_rinfo_ = !rinfo_;
+ if (rinfo_ == NULL) rinfo_ = THCudaIntTensor_new(state);
+ THCudaIntTensor_resize1d(state, rinfo_, num_batches);
+ int *info_gpu = THCudaIntTensor_data(state, rinfo_);
+
+ // Copy pointers to device.
+ real **d_result;
+ size_t matrices_size = num_batches * sizeof(real*);
+ THCudaCheck(THCudaMalloc(state, (void**)&d_result, matrices_size));
+
+ const long block = 512;
+ const long grid = (num_batches + block - 1) / block;
+ createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ (const real**)d_result, THCTensor_(data)(state, ra__),
+ ra__->stride[0], num_batches);
+
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgetrf(state, n, d_result, lda, pivots_gpu, info_gpu, num_batches);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgetrf(state, n, d_result, lda, pivots_gpu, info_gpu, num_batches);
+#endif
+
+ THCudaFree(state, d_result);
+
+ if (ra__ != ra_) {
+ THCTensor_(freeCopyTo)(state, ra__, ra_);
+ }
+
+ if (free_rinfo_) {
+ int min = THCudaIntTensor_minall(state, rinfo_);
+ int max = THCudaIntTensor_maxall(state, rinfo_);
+ THCudaIntTensor_free(state, rinfo_);
+ if (min != 0 || max != 0) {
+ THError("failed to factorize some batch elements (min info == %d, max info == %d)",
+ min, max);
+ }
+ }
+
+#else
+ THError("unimplemented data type");
+#endif
+}
+
+
+THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b,
+ THCTensor *atf, THCudaIntTensor *pivots)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b));
+ THArgCheck(THCTensor_(nDimension)(state, atf) == 3, 3, "expected 3D tensor");
+ THArgCheck(THCTensor_(nDimension)(state, b) == 3 ||
+ THCTensor_(nDimension)(state, b) == 2, 4, "expected 2D or 3D tensor");
+ THArgCheck(THCTensor_(size)(state, atf, 0) ==
+ THCTensor_(size)(state, b, 0), 3, "number of batches must be equal");
+ THArgCheck(THCTensor_(size)(state, atf, 1) ==
+ THCTensor_(size)(state, atf, 2), 3, "A matrices must be square");
+ THArgCheck(THCTensor_(size)(state, atf, 1) ==
+ THCTensor_(size)(state, b, 1), 3, "dimensions of A and b must be equal");
+
+ if (rb_ != b) {
+ THCTensor_(resizeAs)(state, rb_, b);
+ THCTensor_(copy)(state, rb_, b);
+ }
+
+
+ int n = atf->size[1];
+ int nrhs = rb_->nDimension > 2 ? rb_->size[2] : 1;
+ THCTensor *atf_;
+ THCTensor *rb__;
+ int lda, ldb;
+
+ // correct ordering of A_tf
+ if (atf->stride[1] == 1) {
+ // column ordered, what BLAS wants
+ lda = atf->stride[2];
+ atf_ = atf;
+ } else {
+ // not column ordered, need to make it such (requires copy)
+ // it would be nice if we could use the op(A) flags to automatically
+ // transpose A if needed, but this leads to unpredictable behavior if the
+ // user clones A_tf later with a different ordering
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, atf, 1, 2);
+ atf_ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, atf_, NULL, 1, 2);
+ lda = atf_->stride[2];
+ }
+
+ // correct ordering of B
+ if (rb_->stride[1] == 1) {
+ // column ordered
+ if (rb_->nDimension == 2 || rb_->size[2] == 1) {
+ ldb = n;
+ } else {
+ ldb = rb_->stride[2];
+ }
+ rb__ = rb_;
+ } else {
+ // make column ordered
+ if (rb_->nDimension > 2) {
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, rb_, 1, 2);
+ rb__ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, rb__, NULL, 1, 2);
+ ldb = rb__->stride[2];
+ } else {
+ rb__ = THCTensor_(newClone)(state, rb_);
+ ldb = n;
+ }
+ }
+
+ long num_batches = rb_->size[0];
+ size_t matrices_size = num_batches * sizeof(real*);
+
+ // Copy pointers to device.
+ real **d_result;
+ const real **d_atf;
+ THCudaCheck(THCudaMalloc(state, (void**)&d_result, matrices_size));
+ THCudaCheck(THCudaMalloc(state, (void**)&d_atf, matrices_size));
+
+ const long block = 512;
+ const long grid = (num_batches + block - 1) / block;
+ createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ (const real**)d_result, THCTensor_(data)(state, rb__),
+ rb__->stride[0], num_batches);
+ createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ d_atf, THCTensor_(data)(state, atf_),
+ atf_->stride[0], num_batches);
+
+ if (!THCudaIntTensor_isContiguous(state, pivots)) {
+ THError("Error: pivots is not contiguous.");
+ }
+
+ int *pivots_data = THCudaIntTensor_data(state, pivots);
+ int info;
+
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches);
+#endif
+
+ if (info < 0) {
+ THError("Illegal arg %d", -info);
+ }
+
+ THCudaFree(state, d_result);
+ THCudaFree(state, d_atf);
+
+ if (atf_ != atf) {
+ THCTensor_(free)(state, atf_);
+ }
+
+ if (rb__ != rb_) {
+ THCTensor_(freeCopyTo)(state, rb__, rb_);
+ }
+
+#else
+ THError("unimplemented data type");
+#endif
+}
+
#endif
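
Both the float/double baddbmm path and the new btrifact feed batched cuBLAS-style routines, which take an array of per-matrix device pointers rather than a single base pointer. The createBatchGemmBuffer launches above build that pointer array on the device; here is a standalone sketch of the idea with illustrative names, leaving out the actual batched BLAS call.

#include <cuda_runtime.h>
#include <cstdio>

// Writes base + i*stride into a device array of matrix pointers, one entry
// per batch element (the same job createBatchGemmBuffer does above).
__global__ void createBatchBuffer(const float** buffer, float* data,
                                  long stride, long num_batches) {
  long i = blockIdx.x * (long)blockDim.x + threadIdx.x;
  if (i < num_batches)
    buffer[i] = data + i * stride;
}

int main() {
  const long num_batches = 8, rows = 4, cols = 4;
  const long stride = rows * cols;          // elements between consecutive matrices

  float* d_data;
  cudaMalloc((void**)&d_data, num_batches * stride * sizeof(float));
  cudaMemset(d_data, 0, num_batches * stride * sizeof(float));

  const float** d_ptrs;
  cudaMalloc((void**)&d_ptrs, num_batches * sizeof(float*));

  const long block = 512;
  const long grid = (num_batches + block - 1) / block;   // same launch math as the patch
  createBatchBuffer<<<grid, block>>>(d_ptrs, d_data, stride, num_batches);
  cudaDeviceSynchronize();

  // d_ptrs is now suitable for a batched routine such as cublasSgemmBatched
  // or cublasSgetrfBatched, which is what the THCudaBlas_* wrappers used
  // above appear to target.
  printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaFree((void*)d_ptrs);
  cudaFree(d_data);
  return 0;
}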
diff --git a/lib/THC/generic/THCTensorMathBlas.h b/lib/THC/generic/THCTensorMathBlas.h
index f37910c..1d9ddfa 100644
--- a/lib/THC/generic/THCTensorMathBlas.h
+++ b/lib/THC/generic/THCTensorMathBlas.h
@@ -9,5 +9,8 @@ THC_API void THCTensor_(addr)(THCState *state, THCTensor *self, real beta, THCTe
THC_API void THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
THC_API void THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
+THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, THCTensor *a);
+THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *atf, THCudaIntTensor *pivots);
+
#endif
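
A note on the addr change in THCTensorMathBlas.cu above: scaling the output by beta cannot replace an explicit zero fill when beta == 0, because in IEEE arithmetic 0 * NaN (and 0 * Inf) stays NaN, so garbage left in a reused output buffer would leak through. A tiny host-side illustration with made-up variable names:

#include <cmath>
#include <cstdio>

int main() {
  float stale = std::nanf("");   // stands in for whatever was left in r_
  float beta  = 0.f;
  printf("beta * stale       = %f\n", beta * stale);  // nan, not 0
  printf("explicit zero fill = %f\n", 0.0f);          // what THCTensor_(zero) produces
  return 0;
}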
diff --git a/lib/THC/generic/THCTensorMathCompare.cu b/lib/THC/generic/THCTensorMathCompare.cu
index 77f1ab5..079583c 100644
--- a/lib/THC/generic/THCTensorMathCompare.cu
+++ b/lib/THC/generic/THCTensorMathCompare.cu
@@ -4,7 +4,7 @@
THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -12,7 +12,7 @@ THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -20,7 +20,7 @@ THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -28,7 +28,7 @@ THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -36,7 +36,7 @@ THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -44,7 +44,7 @@ THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>(value));
@@ -52,7 +52,7 @@ THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTe
THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -60,7 +60,7 @@ THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -68,7 +68,7 @@ THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -76,7 +76,7 @@ THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -84,7 +84,7 @@ THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
@@ -92,7 +92,7 @@ THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *
THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THC_logicalValue(state, self_, src,
TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>(value));
diff --git a/lib/THC/generic/THCTensorMathCompareT.cu b/lib/THC/generic/THCTensorMathCompareT.cu
index 4b59abf..e541641 100644
--- a/lib/THC/generic/THCTensorMathCompareT.cu
+++ b/lib/THC/generic/THCTensorMathCompareT.cu
@@ -5,7 +5,7 @@
THC_API void
THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLTOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -14,7 +14,7 @@ THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGTOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -23,7 +23,7 @@ THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLEOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -32,7 +32,7 @@ THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGEOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -41,7 +41,7 @@ THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorEQOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -50,7 +50,7 @@ THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorNEOp<typename TensorUtils<THCTensor>::DataType,
unsigned char>());
@@ -59,7 +59,7 @@ THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1,
THC_API void
THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLTOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -68,7 +68,7 @@ THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGTOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -77,7 +77,7 @@ THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorLEOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -86,7 +86,7 @@ THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorGEOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -95,7 +95,7 @@ THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorEQOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
@@ -104,7 +104,7 @@ THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen
THC_API void
THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THC_logicalTensor(state, self_, src1, src2,
TensorNEOp<typename TensorUtils<THCTensor>::DataType,
typename TensorUtils<THCTensor>::DataType>());
diff --git a/lib/THC/generic/THCTensorMathMagma.cu b/lib/THC/generic/THCTensorMathMagma.cu
index 635834d..c35a83e 100644
--- a/lib/THC/generic/THCTensorMathMagma.cu
+++ b/lib/THC/generic/THCTensorMathMagma.cu
@@ -10,7 +10,7 @@ static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src,
{
long size[1] = { k };
long stride[1] = { 1 };
- THCTensor_(rawResize)(state, self, 1, size, stride);
+ THCTensor_(resizeNd)(state, self, 1, size, stride);
size_t len = k * sizeof(real);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
@@ -19,7 +19,7 @@ static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src,
{
long size[2] = { m, n };
long stride[2] = { 1, m };
- THCTensor_(rawResize)(state, self, 2, size, stride);
+ THCTensor_(resizeNd)(state, self, 2, size, stride);
size_t len = m * n * sizeof(real);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
@@ -54,7 +54,7 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
long size[2] = { src->size[0], src->size[1] };
long stride[2] = { 1, src->size[0] };
- THCTensor_(rawResize)(state, self, 2, size, stride);
+ THCTensor_(resizeNd)(state, self, 2, size, stride);
THCTensor_(copy)(state, self, src);
return self;
}
@@ -286,13 +286,14 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_,
#ifdef USE_MAGMA
THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
- magma_vec_t jobvt = jobu;
+ magma_vec_t jobz = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
+ int iunused[1];
int m = a->size[0];
int n = a->size[1];
int k = m < n ? m : n;
- int j = (jobu == MagmaAllVec) ? m : k;
+ int j = (jobz == MagmaAllVec) ? m : k;
+ int jv = (jobz == MagmaAllVec) ? n : k;
real *a_data = th_magma_malloc_pinned<real>(m * n);
THCTensor_(copyTensor2d)(state, a_data, a);
@@ -305,32 +306,36 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_,
int info;
#if defined(THC_REAL_IS_FLOAT)
- magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+ magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info);
#else
- magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+ magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info);
#endif
int lwork = (int) wkopt;
real *work_data = th_magma_malloc_pinned<real>(lwork);
+ int *iwork = th_magma_malloc_pinned<int>(8 * k);
#if defined(THC_REAL_IS_FLOAT)
- magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+ magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info);
#else
- magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+ magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info);
#endif
if (info > 0)
- THError("MAGMA gesvd : %d superdiagonals failed to converge", info);
+ THError("MAGMA gesdd : the updating process of SBDSDC did not converge (error: %d)", info);
else if (info < 0)
- THError("MAGMA gesvd : Argument %d : illegal value", -info);
+ THError("MAGMA gesdd : Argument %d : illegal value", -info);
THCTensor_(copyArray2d)(state, rv_, rv_data, n, n);
THCTensor_(transpose)(state, rv_, NULL, 0, 1);
+ if (jobz != MagmaAllVec)
+ THCTensor_(narrow)(state, rv_, rv_, 1, 0, jv);
THCTensor_(copyArray2d)(state, ru_, ru_data, m, j);
THCTensor_(copyArray1d)(state, rs_, rs_data, k);
THCTensor_(copyArray2d)(state, ra_, a_data, m, n);
magma_free_pinned(work_data);
+ magma_free_pinned(iwork);
magma_free_pinned(rv_data);
magma_free_pinned(ru_data);
magma_free_pinned(rs_data);
@@ -453,6 +458,11 @@ THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a)
THCudaCheck(THCudaFree(state, ipiv_gpu));
THCudaCheck(THCudaFree(state, info_gpu));
+
+ THCudaCheck(THCudaFree(state, d_matrices1));
+ THCudaCheck(THCudaFree(state, d_matrices1_const));
+ THCudaCheck(THCudaFree(state, d_matrices2));
+
THCTensor_(freeCopyTo)(state, output, input);
#endif
}
@@ -598,19 +608,42 @@ THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THC
int k = (m < n ? m : n);
#ifdef MAGMA_V2
+#if defined(THC_REAL_IS_FLOAT)
int nb = magma_get_sgeqrf_nb(m, n);
#else
+ int nb = magma_get_dgeqrf_nb(m, n);
+#endif
+#else
+#if defined(THC_REAL_IS_FLOAT)
int nb = magma_get_sgeqrf_nb(m);
+#else
+ int nb = magma_get_dgeqrf_nb(m);
+#endif
#endif
real *a_data = THCTensor_(data)(state, a);
- real *tau_data = th_magma_malloc_pinned<real>(n*n);
-
- THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + ((n+31)/32)*32)*nb);
+ real *tau_data = th_magma_malloc_pinned<real>(k);
+ THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + magma_roundup(n, 32))*nb);
real *work_data = THCTensor_(data)(state, work);
int info;
#if defined(THC_REAL_IS_FLOAT)
+ magma_sgeqrf2_gpu(m, n, a_data, m, tau_data, &info);
+#else
+ magma_dgeqrf2_gpu(m, n, a_data, m, tau_data, &info);
+#endif
+
+ if (info != 0)
+ THError("MAGMA geqrf2 : Argument %d : illegal value.", -info);
+
+ THCTensor_(narrow)(state, a, a, 0, 0, k);
+ THCTensor_(triu)(state, rr_, a, 0);
+ THCTensor_(free)(state, a);
+
+ a = THCTensor_(newColumnMajor)(state, rq_, a_);
+ a_data = THCTensor_(data)(state, a);
+
+#if defined(THC_REAL_IS_FLOAT)
magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
#else
magma_dgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
@@ -622,10 +655,6 @@ THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THC
THCTensor *q = THCTensor_(newColumnMajor)(state, rq_, a);
real *q_data = THCTensor_(data)(state, q);
- THCTensor_(narrow)(state, a, a, 0, 0, k);
- THCTensor_(triu)(state, rr_, a, 0);
- THCTensor_(free)(state, a);
-
#if defined(THC_REAL_IS_FLOAT)
magma_sorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info);
#else
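
The gesvd2 rework above switches from the gesvd driver to the divide-and-conquer gesdd driver, which needs an extra integer workspace of size 8*min(m,n) and is sized with the usual two-phase query (lwork = -1, then the real call). A minimal host-side sketch of that calling sequence, assuming a MAGMA 2.x build; the sizes and the zero input matrix are illustrative only.

#include <magma_v2.h>
#include <cstdlib>
#include <cstdio>

int main() {
  magma_init();

  const magma_int_t m = 4, n = 3, k = (m < n ? m : n);
  float *a  = (float*) calloc((size_t)(m * n), sizeof(float));  // column-major, lda = m
  float *s  = (float*) calloc((size_t)k,       sizeof(float));
  float *u  = (float*) calloc((size_t)(m * m), sizeof(float));
  float *vt = (float*) calloc((size_t)(n * n), sizeof(float));
  magma_int_t iunused[1], info = 0;

  // Phase 1: workspace query; gesdd reports the optimal lwork in wkopt.
  float wkopt = 0.f;
  magma_sgesdd(MagmaAllVec, m, n, a, m, s, u, m, vt, n,
               &wkopt, -1, iunused, &info);

  // Phase 2: allocate work plus the 8*min(m,n) integer workspace, then run.
  magma_int_t lwork = (magma_int_t) wkopt;
  float *work = (float*) malloc((size_t)lwork * sizeof(float));
  magma_int_t *iwork = (magma_int_t*) malloc((size_t)(8 * k) * sizeof(magma_int_t));
  magma_sgesdd(MagmaAllVec, m, n, a, m, s, u, m, vt, n,
               work, lwork, iwork, &info);
  printf("gesdd info = %lld\n", (long long) info);

  free(iwork); free(work); free(vt); free(u); free(s); free(a);
  magma_finalize();
  return 0;
}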
diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu
index 0b4094b..def5970 100644
--- a/lib/THC/generic/THCTensorMathPairwise.cu
+++ b/lib/THC/generic/THCTensorMathPairwise.cu
@@ -5,7 +5,7 @@
THC_API void
THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -24,7 +24,7 @@ THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorSubConstantOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -43,7 +43,7 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -62,7 +62,7 @@ THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero");
if (self_ == src_) {
@@ -81,9 +81,57 @@ THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
}
THC_API void
+THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THCTensor_(mul)(state, self_, src_, pow(2, value));
+#elif defined(THC_REAL_IS_HALF)
+ return THError("lshift not supported for torch.CudaHalfTensor");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorLShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorLShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THCTensor_(mul)(state, self_, src_, pow(2, value));
+#elif defined(THC_REAL_IS_HALF)
+ return THError("rshift not supported for torch.CudaHalfTensor");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorRShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorRShiftConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorFmodOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -102,7 +150,7 @@ THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
THC_API void
THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (self_ == src_) {
if (!THC_pointwiseApply1(state, self_, TensorRemainderOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -120,7 +168,7 @@ THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, real v
void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, long k)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
THArgCheck(src_->nDimension == 2, 1, "expected a matrix");
THCTensor *src = src_;
@@ -153,7 +201,7 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, long k
void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, long k)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
THArgCheck(src_->nDimension == 2, 1, "expected a matrix");
THCTensor *src = src_;
@@ -186,7 +234,7 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, long k
THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
if (!THCTensor_(isSameSizeAs(state, self_, src_))) {
return 0;
}
@@ -210,4 +258,70 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_
return min != 0;
}
+THC_API void
+THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+ return THError("bitand only supported for integer type tensors");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+ return THError("bitor only supported for integer type tensors");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+ return THError("bitxor only supported for integer type tensors");
+#else
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
#endif
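
The scalar lshift/rshift added above fall back to scaling for floating-point tensors: shifting by value is treated as multiplying (or dividing) by 2^value, integer tensors get a genuine bitwise shift, and half raises an error. A small host-side sketch of that convention; the helper name is illustrative.

#include <cmath>
#include <cstdio>

// Mirrors the dispatch above: floating types scale by 2^value,
// integer types shift bits.
template <typename T>
T lshiftLike(T x, T value, bool is_floating) {
  if (is_floating)
    return (T)(x * std::pow(2.0, (double) value));   // float/double path
  return (T)((long) x << (long) value);              // integer path
}

int main() {
  printf("%g\n", lshiftLike(1.5, 3.0, true));          // 12  (1.5 * 2^3)
  printf("%ld\n", (long) lshiftLike(3L, 2L, false));   // 12  (3 << 2)
  return 0;
}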
diff --git a/lib/THC/generic/THCTensorMathPairwise.h b/lib/THC/generic/THCTensorMathPairwise.h
index 261c203..8b6bcd6 100644
--- a/lib/THC/generic/THCTensorMathPairwise.h
+++ b/lib/THC/generic/THCTensorMathPairwise.h
@@ -6,8 +6,13 @@ THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, r
THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(lshift)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(rshift)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(remainder)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src);
diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu
index b97908a..cdf4b82 100644
--- a/lib/THC/generic/THCTensorMathPointwise.cu
+++ b/lib/THC/generic/THCTensorMathPointwise.cu
@@ -14,7 +14,7 @@
}; \
\
void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); \
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); \
if (self_ == src) { \
if (!THC_pointwiseApply1(state, self_, Tensor_##NAME##_##REAL##_Op())) { \
THArgCheck(false, 2, CUTORCH_DIM_WARNING); \
@@ -36,6 +36,7 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( log, THCNumerics<real>::log, Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(lgamma, THCNumerics<real>::lgamma, Real)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log1p, THCNumerics<real>::log1p, Real)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( exp, THCNumerics<real>::exp, Real)
IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cos, THCNumerics<real>::cos, Real)
@@ -66,7 +67,7 @@ IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( abs, THCNumerics<real>::abs, Real)
#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC
void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) {
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorSignOp<real>())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -85,7 +86,7 @@ void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) {
void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, real min_value,
real max_value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorClampOp<real>(min_value, max_value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -104,7 +105,7 @@ void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, real m
THC_API void
THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, x, y));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y));
int i;
long nd = THCTensor_(nDimension)(state, x);
@@ -140,7 +141,7 @@ THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y,
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
void THCTensor_(sigmoid)(THCState* state, THCTensor* self_, THCTensor* src) {
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp<real>())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -157,7 +158,7 @@ void THCTensor_(sigmoid)(THCState* state, THCTensor* self_, THCTensor* src) {
}
void THCTensor_(pow)(THCState *state, THCTensor *self_, THCTensor *src, real value) {
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorPowOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -175,7 +176,7 @@ void THCTensor_(pow)(THCState *state, THCTensor *self_, THCTensor *src, real val
void THCTensor_(tpow)(THCState *state, THCTensor *self_, real value, THCTensor *src)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
if (self_ == src) {
if (!THC_pointwiseApply1(state, self_, TensorTPowOp<real>(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -194,7 +195,7 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, real value, THCTensor *
THC_API void
THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w)
{
- THAssert(THCTensor_(checkGPU)(state, 3, result, a, b));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b));
THArgCheck(THCTensor_(nElement)(state, a) ==
THCTensor_(nElement)(state, b), 3, "sizes do not match");
THCTensor_(resizeAs)(state, result, a);
@@ -211,7 +212,7 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b,
THC_API void
THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -249,7 +250,7 @@ THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
THC_API void
THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -291,7 +292,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
THC_API void
THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -315,7 +316,7 @@ THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *
THC_API void
THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -339,7 +340,7 @@ THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *
THC_API void
THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 3, "sizes do not match");
@@ -361,9 +362,65 @@ THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *
}
THC_API void
+THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF)
+ return THError("clshift not supported for torch.CudaHalfTensor");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self <<= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorLShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 << src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorLShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF)
+ return THError("crshift not supported for torch.CudaHalfTensor");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self >>= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorRShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 >> src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorRShiftOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -382,7 +439,7 @@ THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s
THC_API void
THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -401,7 +458,7 @@ THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s
THC_API void
THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -420,7 +477,7 @@ THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTen
THC_API void
THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2));
THArgCheck(THCTensor_(nElement)(state, src1) ==
THCTensor_(nElement)(state, src2), 2, "sizes do not match");
@@ -439,7 +496,7 @@ THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *
THC_API void
THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (self == src) {
if (!THC_pointwiseApply1(state, self, TensorMaxValueOp<real>(value))) {
@@ -456,7 +513,7 @@ THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real val
THC_API void
THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (self == src) {
if (!THC_pointwiseApply1(state, self, TensorMinValueOp<real>(value))) {
@@ -473,7 +530,7 @@ THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real val
THC_API void
THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
if(self_ != t)
{
THCTensor_(resizeAs)(state, self_, t);
@@ -498,7 +555,7 @@ THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, real value,
THC_API void
THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2)
{
- THAssert(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
if(self_ != t)
{
THCTensor_(resizeAs)(state, self_, t);
@@ -519,4 +576,87 @@ THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, real value,
THCudaCheck(cudaGetLastError());
}
+THC_API void
+THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ return THError("cbitand is only supported for integer type tensors");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self &= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 & src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ return THError("cbitor is only supported for integer type tensors");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self |= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 | src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
+
+THC_API void
+THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ return THError("cbitor is only supported for integer type tensors");
+#else
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self ^= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 ^ src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+#endif
+}
#endif
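
The hunks above add element-wise shift (clshift, crshift) and bitwise (cbitand, cbitor, cbitxor) kernels; the shift variants are rejected for half tensors and the bitwise variants for half/float/double. A minimal Lua sketch of the intended use, assuming the TensorMath.lua changes in this commit expose them under the method names lshift/rshift/bitand/bitor/bitxor (names assumed, not verified here):

  require 'cutorch'
  local a = torch.CudaIntTensor(4):fill(12)   -- binary 1100
  local b = torch.CudaIntTensor(4):fill(10)   -- binary 1010
  print(a:clone():bitand(b))  -- element-wise AND: 8  (1000)
  print(a:clone():bitor(b))   -- element-wise OR:  14 (1110)
  print(a:clone():bitxor(b))  -- element-wise XOR: 6  (0110)
  print(a:clone():lshift(2))  -- shift by a scalar: 48; a tensor argument would map to clshift
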
diff --git a/lib/THC/generic/THCTensorMathPointwise.h b/lib/THC/generic/THCTensorMathPointwise.h
index 34e594a..17171c0 100644
--- a/lib/THC/generic/THCTensorMathPointwise.h
+++ b/lib/THC/generic/THCTensorMathPointwise.h
@@ -6,6 +6,7 @@
THC_API void THCTensor_(sigmoid)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(log)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(lgamma)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(log1p)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(exp)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(cos)(THCState *state, THCTensor *self, THCTensor *src);
@@ -44,12 +45,17 @@ THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1,
THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(clshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(crshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value);
THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(addcmul)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2);
THC_API void THCTensor_(addcdiv)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2);
diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu
index ed0e204..bbc950e 100644
--- a/lib/THC/generic/THCTensorMathReduce.cu
+++ b/lib/THC/generic/THCTensorMathReduce.cu
@@ -3,13 +3,14 @@
#else
THC_API void
-THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension, int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (!THC_reduceDim(state, self, src,
thrust::identity<real>(),
ReduceAdd<real, real>(),
ScalarConvert<int, real>::to(0),
- dimension)) {
+ dimension,
+ keepdim)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -17,13 +18,14 @@ THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension
}
THC_API void
-THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimension, int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (!THC_reduceDim(state, self, src,
thrust::identity<real>(),
ReduceMultiply<real, real>(),
ScalarConvert<int, real>::to(1),
- dimension)) {
+ dimension,
+ keepdim)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -31,10 +33,10 @@ THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimensio
}
THC_API void
-THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim)
+THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
- THCTensor_(sum)(state, self, src, dim);
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
+ THCTensor_(sum)(state, self, src, dim, keepdim);
THCTensor_(div)(state, self, self, ScalarConvert<long, real>::to(THCTensor_(size)(state, src, dim)));
}
@@ -43,7 +45,7 @@ THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim)
THC_API void
THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, real maxnorm)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
THCTensor *self_;
THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0);
THCTensor *data = THCTensor_(newClone)(state, src_);
@@ -70,9 +72,9 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value,
}
THC_API void
-THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag)
+THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THLongStorage *dim = THCTensor_(newSizeOf)(state, src);
THLongStorage_set(dim, dimension, 1);
THCTensor_(resize)(state, self_, dim, NULL);
@@ -89,12 +91,16 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, long dimensio
THCTensor_(free)(state, src);
THCTensor_(freeCopyTo)(state, self, self_);
+
+ if (!keepdim) {
+ THCTensor_(squeeze1d)(state, self_, self_, dimension);
+ }
}
THC_API void
-THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag)
+THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src));
THLongStorage *dim = THCTensor_(newSizeOf)(state, src);
THLongStorage_set(dim, dimension, 1);
THCTensor_(resize)(state, self_, dim, NULL);
@@ -111,19 +117,23 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, long dimensio
THCTensor_(free)(state, src);
THCTensor_(freeCopyTo)(state, self, self_);
+
+ if (!keepdim) {
+ THCTensor_(squeeze1d)(state, self_, self_, dimension);
+ }
}
THC_API accreal
THCTensor_(stdall)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
return THCNumerics<accreal>::sqrt((THCTensor_(varall)(state, self)));
}
THC_API accreal
THCTensor_(varall)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal mean = THCTensor_(meanall)(state, self);
accreal val;
@@ -146,28 +156,28 @@ THCTensor_(varall)(THCState *state, THCTensor *self)
}
THC_API void
-THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension)
+THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, int keepdim)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(0.0))) {
THC_reduceDim(state, self, src,
TensorNonZeroOp<real>(), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
} else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(1.0))) {
THC_reduceDim(state, self, src,
TensorNormOp<real, 1>(value), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
} else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(2.0))) {
THC_reduceDim(state, self, src,
TensorNormOp<real, 2>(value), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
THCTensor_(pow)(state, self, self, ScalarConvert<float, real>::to(0.5));
} else {
THC_reduceDim(state, self, src,
TensorNormOp<real, -1>(value), ReduceAdd<real, real>(),
- ScalarConvert<float, real>::to(0.0), dimension);
+ ScalarConvert<float, real>::to(0.0), dimension, keepdim);
THCTensor_(pow)(state, self, self, THCNumerics<real>::cinv(value));
}
@@ -177,7 +187,7 @@ THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, l
THC_API accreal
THCTensor_(normall)(THCState *state, THCTensor *self, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal result;
if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(0.0))) {
@@ -222,7 +232,7 @@ THCTensor_(normall)(THCState *state, THCTensor *self, real value)
accreal THCTensor_(dist)(THCState *state, THCTensor *self,
THCTensor *src, real value)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
self = THCTensor_(newContiguous)(state, self);
ptrdiff_t size = THCTensor_(nElement)(state, self);
src = THCTensor_(newContiguous)(state, src);
@@ -248,7 +258,7 @@ accreal THCTensor_(dist)(THCState *state, THCTensor *self,
THC_API accreal
THCTensor_(sumall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -265,7 +275,7 @@ THCTensor_(sumall)(THCState *state, THCTensor *self) {
THC_API accreal
THCTensor_(prodall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
accreal val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -276,11 +286,6 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) {
THArgCheck(false, 1, CUTORCH_DIM_WARNING);
}
- val = THCNumerics<accreal>::div(
- val,
- ScalarConvert<long, accreal>::to(THCTensor_(nElement)(state, self)) - 1
- );
-
THCudaCheck(cudaGetLastError());
return val;
}
@@ -288,14 +293,14 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) {
THC_API accreal
THCTensor_(meanall)(THCState *state, THCTensor *self)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
THArgCheck(self->nDimension > 0, 1, "empty Tensor");
return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self);
}
THC_API real
THCTensor_(minall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
real val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -311,7 +316,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) {
THC_API real
THCTensor_(maxall)(THCState *state, THCTensor *self) {
- THAssert(THCTensor_(checkGPU)(state, 1, self));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self));
real val;
if (!THC_reduceAll(state, self,
thrust::identity<real>(),
@@ -330,8 +335,9 @@ THCTensor_(max)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *src,
- long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+ long dimension,
+ int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src));
thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
init =
@@ -339,7 +345,7 @@ THCTensor_(max)(THCState *state,
THCNumerics<typename TensorUtils<THCTensor>::DataType>::min(), 1);
return THC_reduceDimIndex(
- state, values, indices, src, dimension, init,
+ state, values, indices, src, dimension, keepdim, init,
MaxValuePair<typename TensorUtils<THCTensor>::DataType, long>());
}
@@ -348,8 +354,9 @@ THCTensor_(min)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *src,
- long dimension) {
- THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+ long dimension,
+ int keepdim) {
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src));
thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
init =
@@ -357,7 +364,7 @@ THCTensor_(min)(THCState *state,
THCNumerics<typename TensorUtils<THCTensor>::DataType>::max(), 1);
return THC_reduceDimIndex(
- state, values, indices, src, dimension, init,
+ state, values, indices, src, dimension, keepdim, init,
MinValuePair<typename TensorUtils<THCTensor>::DataType, long>());
}
diff --git a/lib/THC/generic/THCTensorMathReduce.h b/lib/THC/generic/THCTensorMathReduce.h
index dc38ed6..095be42 100644
--- a/lib/THC/generic/THCTensorMathReduce.h
+++ b/lib/THC/generic/THCTensorMathReduce.h
@@ -5,9 +5,9 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
THC_API void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, real max_norm);
-THC_API void THCTensor_(std)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag);
-THC_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension);
-THC_API void THCTensor_(var)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag);
+THC_API void THCTensor_(std)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag, int keepdim);
+THC_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, int keepdim);
+THC_API void THCTensor_(var)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag, int keepdim);
THC_API accreal THCTensor_(stdall)(THCState *state, THCTensor *self);
THC_API accreal THCTensor_(normall)(THCState *state, THCTensor *self, real value);
@@ -15,9 +15,9 @@ THC_API accreal THCTensor_(varall)(THCState *state, THCTensor *self);
#endif
-THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, long dim);
-THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, long dim);
-THC_API void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim);
+THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim);
+THC_API void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim, int keepdim);
THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self);
THC_API accreal THCTensor_(prodall)(THCState *state, THCTensor *self);
@@ -26,11 +26,11 @@ THC_API accreal THCTensor_(meanall)(THCState *state, THCTensor *self);
THC_API void THCTensor_(min)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
- THCTensor *src, long dim);
+ THCTensor *src, long dim, int keepdim);
THC_API void THCTensor_(max)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
- THCTensor *src, long dim);
+ THCTensor *src, long dim, int keepdim);
THC_API real THCTensor_(minall)(THCState *state, THCTensor *self);
THC_API real THCTensor_(maxall)(THCState *state, THCTensor *self);
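
All of the dimension reductions above (sum, prod, mean, norm, std, var, min, max) now take a keepdim flag: with keepdim == 1 the reduced dimension stays as size 1, with keepdim == 0 it is squeezed out via squeeze1d. A small Lua sketch of the two shapes, assuming the Lua wrapper keeps the pre-existing torch behaviour of retaining the reduced dimension by default:

  require 'cutorch'
  local x = torch.CudaTensor(4, 5):uniform()
  local s = x:sum(2)         -- keepdim == 1 at the C level: a 4x1 result
  print(s:size())
  print(s:squeeze():size())  -- dropping the singleton dim gives the keepdim == 0 shape: 4
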
diff --git a/lib/THC/generic/THCTensorMathScan.cu b/lib/THC/generic/THCTensorMathScan.cu
index 8a8e434..8242139 100644
--- a/lib/THC/generic/THCTensorMathScan.cu
+++ b/lib/THC/generic/THCTensorMathScan.cu
@@ -2,6 +2,27 @@
#define THC_GENERIC_FILE "generic/THCTensorMathScan.cu"
#else
+#ifndef THC_REAL_IS_HALF
+template<class BinaryFunction>
+__host__ void THCTensor_(scanThrust)(
+ THCState *state,
+ THCTensor *dst,
+ THCTensor *src,
+ BinaryFunction binary_op)
+{
+ THCThrustAllocator thrustAlloc(state);
+ thrust::device_ptr<real> src_data(THCTensor_(data)(state, src));
+ thrust::device_ptr<real> dst_data(THCTensor_(data)(state, dst));
+ ptrdiff_t size = THCTensor_(nElement)(state, src);
+ thrust::inclusive_scan(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+ src_data, src_data + size, dst_data,
+ binary_op);
+}
+#endif
+
template<class BinaryOp>
__host__ void THCTensor_(scanOuterDim)(THCState *state, THCTensor *tgt,
THCTensor *src, long dimension,
@@ -57,12 +78,22 @@ template<class BinaryFunction>
void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src,
long dimension, real init, BinaryFunction binary_op)
{
- THCTensor_(resizeAs)(state, self_, src);
+ // "init" must be the identity element for binary_op
+ int ndim = THCTensor_(nDimension)(state, src);
+ THArgCheck(dimension >= 0 && dimension < ndim, 3, "dimension %d out of range",
+ dimension + TH_INDEX_BASE);
+ THCTensor_(resizeAs)(state, self_, src);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
src = THCTensor_(newContiguous)(state, src);
- if (dimension == THCTensor_(nDimension)(state, src) - 1) {
+#ifndef THC_REAL_IS_HALF
+ if (ndim == 1) {
+ // thrust does not take an "init"
+ THCTensor_(scanThrust)(state, self, src, binary_op);
+ } else
+#endif
+ if (dimension == ndim - 1) {
THCTensor_(scanInnermostDim)(state, self, src, init, binary_op);
} else {
THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op);
@@ -74,14 +105,14 @@ void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src,
void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, long dimension)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
return THCTensor_(scanDim)(state, self, src, dimension,
ScalarConvert<float, real>::to(0.0), AddOp<real>());
}
void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, long dimension)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src));
return THCTensor_(scanDim)(state, self, src, dimension,
ScalarConvert<float, real>::to(1.0), MulOp<real>());
}
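
scanDim now validates the requested dimension and, for 1-D tensors of non-half types, routes through thrust::inclusive_scan on the current stream (which needs no explicit init, hence the identity-element note above); other shapes still use the inner/outer-dimension kernels. From the Lua side cumsum and cumprod behave as before; a quick sketch:

  require 'cutorch'
  local x = torch.Tensor({1, 2, 3, 4}):cuda()
  print(x:cumsum(1))   -- 1  3  6  10  (inclusive prefix sum; 1-D tensors hit the Thrust path)
  print(x:cumprod(1))  -- 1  2  6  24  (inclusive prefix product)
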
diff --git a/lib/THC/generic/THCTensorMode.cu b/lib/THC/generic/THCTensorMode.cu
new file mode 100644
index 0000000..e5a17f2
--- /dev/null
+++ b/lib/THC/generic/THCTensorMode.cu
@@ -0,0 +1,315 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMode.cu"
+#else
+
+THC_API void THCTensor_(calculateMode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ THCudaLongStorage *sortBuffer,
+ int dimension,
+ THLongStorage *position) {
+ THAssert(THCTensor_(isContiguous)(state, input));
+
+ // Because the input is contiguous, we want to get a reference to the
+ // location of the buffer at the innermost dimension that we are going
+ // to calculate the mode for --> we do this by manually doing the stride
+ // calculations to get an offset
+ real *data = THCTensor_(data)(state, input);
+ for (int i = 0; i < THLongStorage_size(position); ++i) {
+ data += THLongStorage_data(position)[i] * THCTensor_(stride)(state, input, i);
+ }
+
+ long nElement = THCTensor_(size)(state, input, THCTensor_(nDimension)(state, input) - 1);
+ THCThrustAllocator thrustAlloc(state);
+
+ // Wrap input data, sortBuffer, in Thrust device vectors
+ thrust::device_ptr<real> vecPtr = thrust::device_pointer_cast(data);
+ thrust::device_vector<real> iter(vecPtr, vecPtr + nElement);
+ thrust::device_ptr<long> sbPtr = thrust::device_pointer_cast(THCudaLongStorage_data(state, sortBuffer));
+ thrust::device_vector<long> seq(sbPtr, sbPtr + nElement);
+
+ // Fill sortBuffer with [0, 1, 2, ... nElement - 1]
+ thrust::sequence(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ seq.begin(), seq.end());
+
+ // Sort the input data. The original indices of the data are stored in seq
+ thrust::sort_by_key(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(), seq.begin()
+#if defined(THC_REAL_IS_HALF)
+ , ThrustHalfLess()
+#endif
+ );
+
+ // Count # of unique elements via an inner product between adjacent elements.
+ // Add 1 if two neighboring elements are not equal.
+ int unique = 1 + thrust::inner_product(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end() - 1, iter.begin() + 1, 0, thrust::plus<int>(),
+#if defined(THC_REAL_IS_HALF)
+ ThrustHalfNotEqualTo()
+#else
+ thrust::not_equal_to<real>()
+#endif
+ );
+
+ // Count frequency of each element
+ thrust::device_vector<real> keys(unique);
+ thrust::device_vector<int> counts(unique);
+ thrust::reduce_by_key(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(),
+ thrust::constant_iterator<int>(1), keys.begin(), counts.begin()
+#if defined(THC_REAL_IS_HALF)
+ , ThrustHalfEqualTo()
+#endif
+ );
+
+ // Find index of maximum count
+ thrust::device_vector<int>::iterator it = thrust::max_element(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ counts.begin(), counts.end());
+ real mode = keys[it - counts.begin()];
+
+ // Find first index within which it occurs
+#if defined(THC_REAL_IS_HALF)
+ thrust::device_vector<real>::iterator positionIter = thrust::find_if(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(), ThrustHalfEqualToPredicate(mode));
+#else
+ thrust::device_vector<real>::iterator positionIter = thrust::find(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+ thrust::device,
+#endif
+ iter.begin(), iter.end(), mode);
+#endif
+
+ THAssert(positionIter != iter.end());
+ long index = TH_INDEX_BASE + seq[positionIter - iter.begin()];
+
+ // Place mode, index in output
+ ptrdiff_t valuesOffset = THCTensor_(storageOffset)(state, values);
+ long indicesOffset = THCudaLongTensor_storageOffset(state, indices);
+
+ for (int i = 0; i < THLongStorage_size(position); ++i) {
+ long pos = THLongStorage_data(position)[i];
+ valuesOffset += THCTensor_(stride)(state, values, i) * pos;
+ indicesOffset += THCudaLongTensor_stride(state, indices, i) * pos;
+ }
+ THCStorage_(set)(state, THCTensor_(storage)(state, values), valuesOffset, mode);
+ THCudaLongStorage_set(state, THCudaLongTensor_storage(state, indices), indicesOffset, index);
+}
+
+// this probably could be a loop, not a recursive algorithm
+THC_API void THCTensor_(dimApplyMode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ THCudaLongStorage *sortBuffer,
+ int dimension,
+ THLongStorage *position,
+ int curDim) {
+ long ndim = THCTensor_(nDimension)(state, input);
+
+ // Because we have transposed the Tensor, the data for the dimension we are mode'ing along
+ // is always in the innermost dimension
+ if (curDim == ndim - 1) {
+ THCTensor_(calculateMode)(state, values, indices, input, sortBuffer, dimension, position);
+ } else {
+ // Loop through the values and recurse
+ for (int i = 0; i < THCTensor_(size)(state, input, curDim); ++i) {
+ position->data[curDim] = i;
+ THCTensor_(dimApplyMode)(state, values, indices, input, sortBuffer, dimension, position, curDim + 1);
+ }
+ }
+}
+
+#define MAX_GRID_SIZE 65535
+#define MAX_BLOCK_SIZE 1024
+
+THC_API void THCTensor_(mode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ int dimension,
+ int keepdim) {
+ THLongStorage *dim;
+ THCTensor *transposed, *contiguous, *valuesTransposed;
+ THLongStorage *position;
+ THCudaLongStorage *sortBuffer;
+ THCudaLongTensor *indicesTransposed;
+ long ndim, sliceSize, slices;
+
+
+ THAssert(THCTensor_(checkGPU)(state, 1, values));
+
+ // Verify they are asking for a valid dimension
+ ndim = THCTensor_(nDimension)(state, input);
+ THArgCheck(dimension >= 0 && dimension < ndim, 4, "Dimension out of bounds");
+
+ sliceSize = THCTensor_(size)(state, input, dimension);
+ slices = THCTensor_(nElement)(state, input) / sliceSize;
+
+ // Resize output value, index Tensors to appropriate sizes (i.e. the same as
+ // the input Tensor, except at dim=dimension, the size is 1)
+ dim = THCTensor_(newSizeOf)(state, input);
+ THLongStorage_set(dim, dimension, 1);
+ THCTensor_(resize)(state, values, dim, NULL);
+ THCudaLongTensor_resize(state, indices, dim, NULL);
+ THLongStorage_free(dim);
+
+ // If sliceSize is 1, copy input to values and set indices
+ if (sliceSize == 1) {
+ THCTensor_(copy)(state, values, input);
+ THCudaLongTensor_fill(state, indices, TH_INDEX_BASE);
+ return;
+ }
+
+ // Requirements for fused kernel implementation:
+ //
+ // 1. sliceSize <= 2 * max threads per block
+ // 2. uses one block per slice, so number of slices must be less than the maximum number of blocks for
+ // a kernel launch
+ // 3. Can use 32-bit index math for indexing (mainly just for implementation conciseness, could be changed)
+ if (sliceSize <= MAX_BLOCK_SIZE &&
+ slices <= MAX_GRID_SIZE &&
+ TensorUtils<THCTensor>::canUse32BitIndexMath(state, input)) {
+ // Beginning our optimized implementation. First thing we want to do is to transpose
+ // the input Tensor along the sort dimension, and then make it contiguous
+ transposed = THCTensor_(newTranspose)(state, input, dimension, ndim - 1);
+ contiguous = THCTensor_(newContiguous)(state, transposed);
+
+ // We also need to view the values and indices Tensors as transposed in order to
+ // properly determine the offset into the underlying storage in which to place the
+ // mode and index for a particular set of dimension values
+ valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim-1);
+ indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim-1);
+
+ // Set-up TensorInfo structs for passing to kernel
+ TensorInfo<real, unsigned int> tiValues = getTensorInfo<THCTensor, unsigned int>(state, valuesTransposed);
+ TensorInfo<long, unsigned int> tiIndices = getTensorInfo<THCudaLongTensor, unsigned int>(state, indicesTransposed);
+
+ // The number of blocks is the number of slices that we need to calculate the mode for. Each block
+ // is responsible for computing a single mode
+ dim3 grid;
+ THC_getGridFromTiles(slices, grid);
+
+ // Each thread handles two elements: round sliceSize up to the nearest power of 2
+ // and use half that many threads per block
+ long ceilPowerOf2 = nextHighestPowerOf2(sliceSize);
+
+ // Macro that calls kernel --> note that we set the block dimensions here, and
+ // the amount of shared memory
+ #define HANDLE_MODE(SIZE) \
+ { \
+ dim3 blockSize(SIZE / 2); \
+\
+ int memsize = (sizeof(real) * SIZE) + (2 * SIZE * sizeof(unsigned int)); \
+ computeMode<real, SIZE> \
+ <<<grid, blockSize, memsize, THCState_getCurrentStream(state)>>>( \
+ THCTensor_(data)(state, contiguous), tiValues, tiIndices, sliceSize); \
+ }
+
+ // Tradeoff between compilation time and the number of specializations. Ideally we would have
+ // one HANDLE_MODE for each power of 2
+ switch(ceilPowerOf2) {
+ case 2048:
+ HANDLE_MODE(2048)
+ break;
+ case 1024:
+ case 512:
+ case 256:
+ HANDLE_MODE(1024)
+ break;
+ case 128:
+ case 64:
+ HANDLE_MODE(128)
+ break;
+ case 32:
+ case 16:
+ case 8:
+ case 4:
+ case 2:
+ HANDLE_MODE(32)
+ break;
+ case 1:
+ default:
+ assert(false);
+ }
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, transposed);
+ THCTensor_(free)(state, contiguous);
+ THCTensor_(free)(state, valuesTransposed);
+ THCudaLongTensor_free(state, indicesTransposed);
+ } else {
+ // Beginning our naive implementation: We don't want to mutate the input Tensor, but
+ // we need to be able to sort the inputs along the dimension in order to calculate the
+ // mode. Additionally, it's ideal if the data along the dimension is contiguous. So
+ // we transpose the dimension with the innermost dimension and make a new contiguous
+ // version that we can use.
+ transposed = THCTensor_(newClone)(state, input);
+ THCTensor_(transpose)(state, transposed, NULL, dimension, ndim - 1);
+ contiguous = THCTensor_(newContiguous)(state, transposed);
+ THCTensor_(free)(state, transposed);
+
+ // We also need to view the values and indices Tensors as transposed in order to
+ // properly determine the offset into the underlying storage in which to place the
+ // mode and index for a particular set of dimension values
+ valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim - 1);
+ indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim - 1);
+
+ // Position is a Storage that will store the dimension values we are processing
+ position = THLongStorage_newWithSize(ndim - 1);
+
+ // Sort Buffer is a Storage that will be used in the internal sort required to calculate
+ // the mode efficiently
+ sortBuffer = THCudaLongStorage_newWithSize(state, sliceSize);
+
+ // Call mode
+ THCTensor_(dimApplyMode)(state, valuesTransposed, indicesTransposed, contiguous, sortBuffer, dimension, position, 0);
+
+ THCTensor_(free)(state, contiguous);
+ THLongStorage_free(position);
+ THCTensor_(free)(state, valuesTransposed);
+ THCudaLongTensor_free(state, indicesTransposed);
+ THCudaLongStorage_free(state, sortBuffer);
+ }
+
+ if (!keepdim) {
+ THCTensor_(squeeze1d)(state, values, values, dimension);
+ THCudaLongTensor_squeeze1d(state, indices, indices, dimension);
+ }
+}
+
+#undef MAX_GRID_SIZE
+#undef MAX_BLOCK_SIZE
+
+#endif
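
THCTensor_(mode) returns the most frequent value along a dimension together with one of its original positions. A fused per-slice kernel is used when the slice fits (sliceSize <= 1024, at most 65535 slices, 32-bit indexable); otherwise each slice is sorted with Thrust and the most frequent key is picked via reduce_by_key and max_element. A quick Lua sketch, assuming torch.mode is wired to this kernel for CUDA tensors by the TensorMath.lua changes in this commit:

  require 'cutorch'
  local x = torch.Tensor({{1, 2, 2, 3},
                          {4, 5, 4, 4}}):cuda()
  local values, indices = torch.mode(x, 2)  -- mode along dim 2
  print(values)   -- expected modes per row: 2 and 4
  print(indices)  -- an original position of each mode within its row
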
diff --git a/lib/THC/generic/THCTensorMode.h b/lib/THC/generic/THCTensorMode.h
new file mode 100644
index 0000000..6f24380
--- /dev/null
+++ b/lib/THC/generic/THCTensorMode.h
@@ -0,0 +1,14 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMode.h"
+#else
+
+/* Returns the mode, and index of the mode, for the set of values
+ * along a given dimension in the input tensor. */
+THC_API void THCTensor_(mode)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ int dimension,
+ int keepdim);
+
+#endif // THC_GENERIC_FILE
diff --git a/lib/THC/generic/THCTensorRandom.cu b/lib/THC/generic/THCTensorRandom.cu
index f6d6979..4c6d2fb 100644
--- a/lib/THC/generic/THCTensorRandom.cu
+++ b/lib/THC/generic/THCTensorRandom.cu
@@ -8,7 +8,7 @@
THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
ptrdiff_t size = THCTensor_(nElement)(state, self);
@@ -22,7 +22,7 @@ THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, do
THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
ptrdiff_t size = THCTensor_(nElement)(state, self);
@@ -37,7 +37,7 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean,
THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
@@ -52,7 +52,7 @@ THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea
THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
@@ -67,7 +67,7 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l
THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
@@ -107,7 +107,7 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
int n_sample,
int with_replacement)
{
- THAssert(THCTensor_(checkGPU)(state, 2, self, prob_dist));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist));
Generator* gen = THCRandom_getGenerator(state);
int inputSize = THCTensor_(nDimension)(state, prob_dist);
@@ -159,8 +159,9 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
int maxThreads = props->maxThreadsPerBlock;
dim3 block(numCategories < maxThreads ? numCategories : maxThreads);
dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4);
- sampleMultinomialOnce
- <<<grid, block, block.x * sizeof(real),
+ sampleMultinomialOnce<real, accreal>
+ <<<grid, block,
+ block.x * (sizeof(real) * sizeof(accreal)),
THCState_getCurrentStream(state)>>>(
THCudaLongTensor_data(state, self),
numDist,
@@ -266,14 +267,14 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
THC_API void THCTensor_(rand)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(uniform)(state, r_, 0, 1);
}
void THCTensor_(randn)(THCState *state, THCTensor *r_, THLongStorage *size)
{
- THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_));
THCTensor_(resize)(state, r_, size, NULL);
THCTensor_(normal)(state, r_, 0, 1);
}
@@ -288,7 +289,7 @@ GENERATE_KERNEL1(generate_bernoulli, real, double p, float, curand_uniform, (Sca
THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
ptrdiff_t size = THCTensor_(nElement)(state, self);
@@ -304,7 +305,7 @@ THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
THC_API void THCTensor_(NAME)(THCState* state, \
THCTensor *self_, PROB_TYPE *probs_) \
{ \
- THAssert(THCTensor_(checkGPU)(state, 2, self_, probs_)); \
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \
Generator* gen = THCRandom_getGenerator(state); \
THCTensor *self = THCTensor_(newContiguous)(state, self_); \
PROB_TYPE *probs = PROB_TYPE##_newContiguous(state, probs_); \
@@ -334,7 +335,7 @@ GENERATE_KERNEL1(generate_geometric, real, double p, float, curand_uniform, (Sca
THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p)
{
- THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
Generator* gen = THCRandom_getGenerator(state);
THCTensor *self = THCTensor_(newContiguous)(state, self_);
diff --git a/lib/THC/generic/THCTensorScatterGather.cu b/lib/THC/generic/THCTensorScatterGather.cu
index c120f88..c3afbbf 100644
--- a/lib/THC/generic/THCTensorScatterGather.cu
+++ b/lib/THC/generic/THCTensorScatterGather.cu
@@ -9,8 +9,8 @@
void THCTensor_(gather)(THCState* state, THCTensor *tensor,
THCTensor *src, int dim, THCudaLongTensor *index) {
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index));
THArgCheck(THCTensor_(nDimension)(state, src) == THCTensor_(nDimension)(state, tensor), 2,
"Input tensor must have same dimensions as output tensor");
@@ -102,8 +102,8 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor,
tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements);
void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) {
- THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
- THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index));
THArgCheck(dim >= 0 && dim < THCTensor_(nDimension)(state, tensor), 2,
"Index dimension is out of bounds");
@@ -191,8 +191,8 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong
void
THCTensor_(scatterFill)(THCState* state, THCTensor *tensor,
int dim, THCudaLongTensor *index, real value) {
- THAssert(THCTensor_(checkGPU)(state, 1, tensor));
- THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index));
THArgCheck(dim >= 0 && dim < THCTensor_(nDimension)(state, tensor), 2,
"Index dimension is out of bounds");
diff --git a/lib/THC/generic/THCTensorSort.cu b/lib/THC/generic/THCTensorSort.cu
index afef796..067af89 100644
--- a/lib/THC/generic/THCTensorSort.cu
+++ b/lib/THC/generic/THCTensorSort.cu
@@ -281,8 +281,8 @@ THC_API void THCTensor_(sort)(THCState* state,
THCudaLongTensor *indices,
THCTensor *input,
int dim, int order) {
- THAssert(THCTensor_(checkGPU)(state, 2, sorted, input));
- THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input));
+ THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices));
long dims = THCTensor_(nDimension)(state, sorted);
THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
dims = THCTensor_(nDimension)(state, input);
diff --git a/lib/THC/generic/THCTensorTopK.cu b/lib/THC/generic/THCTensorTopK.cu
new file mode 100644
index 0000000..83ab1e1
--- /dev/null
+++ b/lib/THC/generic/THCTensorTopK.cu
@@ -0,0 +1,159 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorTopK.cu"
+#else
+
+THC_API void THCTensor_(topk)(THCState* state,
+ THCTensor *topK,
+ THCudaLongTensor *indices,
+ THCTensor *input,
+ long k, int dim, int dir, int sorted) {
+ THAssert(topK != NULL && indices != NULL && input != NULL);
+ THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input));
+ THArgCheck(THCTensor_(nDimension)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+ long dims = THCudaLongTensor_nDimension(state, indices);
+ THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING);
+ int numDims = THCTensor_(nDimension)(state, input);
+ THArgCheck(numDims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
+
+ THArgCheck(dim >= 0 && dim < numDims, 6, "dim not in range");
+
+ long sliceSize = THCTensor_(size)(state, input, dim);
+ THArgCheck(k > 0 && k <= sliceSize, 5, "k not in range for dimension");
+
+ // Build the output size, which is the dim being selected set to
+ // size k
+ THLongStorage* topKSize = THCTensor_(newSizeOf)(state, input);
+ THLongStorage_set(topKSize, dim, k);
+ THCTensor_(resize)(state, topK, topKSize, NULL);
+ THCudaLongTensor_resize(state, indices, topKSize, NULL);
+ THLongStorage_free(topKSize);
+
+#define RUN_K(INDEX_T, DIM, DIR) \
+ gatherTopK<real, INDEX_T, DIM, DIR> \
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
+ inputInfo, \
+ sliceSize, \
+ k, \
+ inputSlices, \
+ /* The actual dimension that the k-selection is running in */ \
+ /* may have changed from collapseDims() */ \
+ inputInfo.strides[collapseInputDim], \
+ topKInfo, \
+ topKSlices, \
+ topKInfo.strides[collapseTopKDim], \
+ indicesInfo, \
+ indicesInfo.strides[collapseIndicesDim])
+
+#define RUN_DIR(INDEX_T, DIM) \
+ if (dir) { \
+ RUN_K(INDEX_T, DIM, true); \
+ } else { \
+ RUN_K(INDEX_T, DIM, false); \
+ }
+
+#define RUN_DIM(INDEX_T) \
+ if (allDims == 1) { \
+ RUN_DIR(INDEX_T, 1); \
+ } else if (allDims == 2) { \
+ RUN_DIR(INDEX_T, 2); \
+ } else if (allDims == 3) { \
+ RUN_DIR(INDEX_T, 3); \
+ } else { \
+ RUN_DIR(INDEX_T, -1); \
+ }
+
+#define RUN_T(INDEX_T) \
+ TensorInfo<real, INDEX_T> inputInfo = \
+ getTensorInfo<THCTensor, INDEX_T>(state, input); \
+ TensorInfo<real, INDEX_T> topKInfo = \
+ getTensorInfo<THCTensor, INDEX_T>(state, topK); \
+ TensorInfo<long, INDEX_T> indicesInfo = \
+ getTensorInfo<THCudaLongTensor, INDEX_T>(state, indices); \
+ \
+ /* We use these structures solely to find the offset to */ \
+ /* each slice we are operating on */ \
+ inputInfo.sizes[dim] = 1; \
+ topKInfo.sizes[dim] = 1; \
+ indicesInfo.sizes[dim] = 1; \
+ \
+ /* Collapse all other dims */ \
+ int collapseInputDim = inputInfo.collapseDims(dim); \
+ int collapseTopKDim = topKInfo.collapseDims(dim); \
+ int collapseIndicesDim = indicesInfo.collapseDims(dim); \
+ \
+ long inputSlices = 1; \
+ long topKSlices = 1; \
+ for (int i = 0; i < numDims; ++i) { \
+ inputSlices *= inputInfo.sizes[i]; \
+ topKSlices *= topKInfo.sizes[i]; \
+ } \
+ \
+ dim3 grid; \
+ if (!THC_getGridFromTiles(inputSlices, grid)) { \
+ THError("Slice to sort is too large"); \
+ } \
+ \
+ dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \
+ \
+ /* This is used as a template parameter to calculate indices. */ \
+ /* We only specialize it if all collapsed dim sizes are the */ \
+ /* same; otherwise, we use -1 which is the specialization */ \
+ /* parameter for arbitrary dimensions */ \
+ int allDims = inputInfo.dims; \
+ if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \
+ allDims = -1; \
+ } \
+ \
+ RUN_DIM(INDEX_T);
+
+ // Based on required index size, run the algorithm with the
+ // appropriate index type
+ if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, input) &&
+ TensorUtils<THCTensor>::canUse32BitIndexMath(state, topK) &&
+ TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
+ RUN_T(unsigned int);
+ } else {
+ RUN_T(unsigned long);
+ }
+#undef RUN_T
+#undef RUN_DIM
+#undef RUN_DIR
+#undef RUN_K
+
+ // Sort the results if the user wants them sorted, since our
+ // selection routine does not ensure sorting
+ if (sorted) {
+ // FIXME: the k/v inplace sort along slice only works for size <=
+ // 2048 at the moment
+ if (sliceSize <= 2048) {
+ // This avoids any memory allocations and performs all sorting
+ // work inplace along the slice
+ THCTensor_(sortKeyValueInplace)(state, topK, indices, dim, dir);
+ } else {
+ // Depend upon the backup sort that returns indices, which we
+ // can use in conjunction with gather to produce the original
+ // indices.
+ // This is not the most efficient implementation, especially since
+ // there are memory allocations performed here. If the user desires
+ // greater performance, they should torch.gather() the results
+ // themselves using the reported indices, providing previously
+ // allocated tensors to receive the results.
+ THCTensor* sortedTopK = THCTensor_(new)(state);
+ THCudaLongTensor* sortedIndices = THCudaLongTensor_new(state);
+ THCTensor_(sort)(state, sortedTopK, sortedIndices, topK, dim, dir);
+
+ THCudaLongTensor* sortedTopKIndices = THCudaLongTensor_new(state);
+
+ THCudaLongTensor_resizeAs(state, sortedTopKIndices, indices);
+ THCudaLongTensor_gather(state, sortedTopKIndices, indices, dim, sortedIndices);
+
+ THCTensor_(freeCopyTo)(state, sortedTopK, topK);
+ THCudaLongTensor_freeCopyTo(state, sortedTopKIndices, indices);
+ THCudaLongTensor_free(state, sortedIndices);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif // THC_GENERIC_FILE
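
The generic topk above resizes both outputs to size k along dim, runs the per-slice gatherTopK selection kernel, and, if sorted is requested, sorts the selected values in place when the slice is small enough (<= 2048) or falls back to the sort-plus-gather path described in the comments. A small Lua sketch, assuming the usual torch.topk signature (input, k, dim, dir, sorted) and that dir == true selects the largest values:

  require 'cutorch'
  local x = torch.Tensor({5, 1, 4, 2, 3}):cuda()
  local vals, idx = torch.topk(x, 3, 1, true, true)
  print(vals)  -- the three largest values (5, 4, 3 under the assumptions above)
  print(idx)   -- their positions in x
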
diff --git a/lib/THC/generic/THCTensorTopK.h b/lib/THC/generic/THCTensorTopK.h
new file mode 100644
index 0000000..2c281b5
--- /dev/null
+++ b/lib/THC/generic/THCTensorTopK.h
@@ -0,0 +1,13 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorTopK.h"
+#else
+
+/* Returns the k smallest (or largest) elements along the given dimension, */
+/* depending on `dir`. */
+THC_API void THCTensor_(topk)(THCState* state,
+ THCTensor* topK,
+ THCudaLongTensor* indices,
+ THCTensor* input,
+ long k, int dim, int dir, int sorted);
+
+#endif // THC_GENERIC_FILE
diff --git a/rocks/cutorch-1.0-0.rockspec b/rocks/cutorch-1.0-0.rockspec
index 07e309e..d904a52 100644
--- a/rocks/cutorch-1.0-0.rockspec
+++ b/rocks/cutorch-1.0-0.rockspec
@@ -21,16 +21,15 @@ dependencies = {
build = {
type = "command",
build_command = [[
-
jopts=$(getconf _NPROCESSORS_CONF)
echo "Building on $jopts cores"
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
]],
platforms = {
windows = {
build_command = [[
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
]]
}
},
diff --git a/rocks/cutorch-scm-1.rockspec b/rocks/cutorch-scm-1.rockspec
index 8314385..5dbdfbe 100644
--- a/rocks/cutorch-scm-1.rockspec
+++ b/rocks/cutorch-scm-1.rockspec
@@ -24,12 +24,12 @@ build = {
jopts=$(getconf _NPROCESSORS_CONF)
echo "Building on $jopts cores"
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
]],
platforms = {
windows = {
build_command = [[
-cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DLUA_INCDIR=$(LUA_INCDIR) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
]]
}
},
diff --git a/test/test.lua b/test/test.lua
index 32918b1..bd78a4f 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -175,13 +175,13 @@ local function createTestTensor(maxSize)
end
local function isEqual(x, y, tolerance, ...)
- if a == nil and b == nil then return true end
- if a == nil and b ~= nil then return false end
- if a ~= nil and b == nil then return false end
+ if x == nil and y == nil then return true end
+ if x == nil and y ~= nil then return false end
+ if x ~= nil and y == nil then return false end
- -- clone the tensors so we can modify the contents if necessary for testing
- local a = x:clone()
- local b = y:clone()
+ -- if x, y are tensors clone them so we can modify the contents if necessary for testing
+ local a = type(x) ~= 'number' and x:clone() or x
+ local b = type(y) ~= 'number' and y:clone() or y
if torch.type(b) ~= torch.type(a) then
b = b:typeAs(a) -- TODO: remove the need for this (a-b doesnt work for bytetensor, cudatensor pairs)
@@ -271,7 +271,6 @@ local function compareFloatAndCuda(x, fn, ...)
.. "are different for function '%s'", tostring(fn)))
for k, _ in ipairs(rcpu) do
if not isEqual(rcpu[k], rcuda[k], tolerance) then
- print(args)
tester:assert(false, errstr)
end
end
@@ -365,10 +364,11 @@ end
-- indexMode = true: keep indexing and masking Tensors as their CPU equivalents
-- false: convert them to baseType when doing CUDA
-- x = first argument tensor
+-- limit: number of returns to compare, if nil, compares all returns
-- gpu2cpu_map = map of gpu types to cpu types
-- fn = function name (as string), or the function
-- ... = the rest of arguments to fn
-local function compareCPUAndCUDATypeTensorArgsWithConv(cudaType, gpu2cpu_map, indexMode, x, fn, ...)
+local function compareCPUAndCUDATypeTensorArgsWithConvInternal(cudaType, gpu2cpu_map, indexMode, limit, x, fn, ...)
local baseType = t2cpu[cudaType]
assert(baseType, 'Cannot find baseType for ' .. cudaType)
local x_cpu = x:type(baseType)
@@ -421,23 +421,30 @@ local function compareCPUAndCUDATypeTensorArgsWithConv(cudaType, gpu2cpu_map, in
tester:assert(#rcpu == #rcuda,
string.format("number of return arguments for CPU and CUDA "
.. "are different for function '%s'", tostring(fn)))
- for k, _ in ipairs(rcpu) do
- tester:assert(isEqual(rcpu[k], rcuda[k], tolerance),
- string.format(errstrval, k, divval(rcpu[k], rcuda[k])))
+
+ if limit ~= nil then
+ for k = 1, limit do
+ tester:assert(isEqual(rcpu[k], rcuda[k], tolerance),
+ string.format(errstrval, k, divval(rcpu[k], rcuda[k])))
+ end
+ else
+ for k, _ in ipairs(rcpu) do
+ tester:assert(isEqual(rcpu[k], rcuda[k], tolerance),
+ string.format(errstrval, k, divval(rcpu[k], rcuda[k])))
+ end
end
+
-- also test x in case function changed object
tester:assert(isEqual(x_cpu, x_cuda, tolerance),
string.format(errstrobj, divval(x_cpu, x_cuda)))
end
--- baseType = the tensor type to test
--- indexMode = true: keep indexing and masking Tensors as their CPU equivalents
--- false: convert then to baseType when doing CUDA
--- x = first argument tensor
--- fn = function name (as string), or the function)
--- ... = the rest of arguments to fn
local function compareCPUAndCUDATypeTensorArgs(cudaType, indexMode, x, fn, ...)
- compareCPUAndCUDATypeTensorArgsWithConv(cudaType, nil, indexMode, x, fn, ...)
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(cudaType, nil, indexMode, nil, x, fn, ...)
+end
+
+local function compareCPUAndCUDATypeTensorArgsWithLimit(cudaType, indexMode, limit, x, fn, ...)
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(cudaType, nil, indexMode, limit, x, fn, ...)
end
function test.squeeze()
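The new limit argument lets a caller compare only the first N return values when later returns (such as index tensors) may legitimately differ between CPU and CUDA, for example when ties are broken differently. A minimal usage sketch, illustrative only, mirroring the topk test later in this diff:

    -- compare only the first return (the values) of topk; the index tensor
    -- may differ between backends when the input contains duplicate values
    local x = torch.FloatTensor(10):uniform()
    compareCPUAndCUDATypeTensorArgsWithLimit('torch.CudaTensor', nil, 1, x, 'topk', 3, 1, true, true)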
@@ -816,7 +823,7 @@ function test.copyAsync()
cutorch.streamSynchronize(cutorch.getStream())
tester:assertTensorEq(device_tensor:double(), host_tensor:double(), 0,
"Async copy to host failed.")
- end
+ end
end
function test.largeNoncontiguous()
@@ -882,6 +889,98 @@ function test.ones()
torch.setdefaulttensortype(t)
end
+function test.linspace()
+ local sz1 = chooseInt(minsize, maxsize)
+ local sz2 = chooseInt(minsize, maxsize)
+ local n = sz1 * sz2
+ local a = torch.uniform()
+ local b = torch.uniform()
+ local x = torch.FloatTensor():rand(sz1, sz2)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b, n)
+ end
+ checkMultiDevice(x, 'linspace', a, b, n)
+
+ -- Check range for non-contiguous tensors.
+ local x = createTestTensorWithSizes(true, true, {sz1, sz2})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b, n)
+ end
+ checkMultiDevice(x, 'linspace', a, b, n)
+
+ -- Check new tensor creation
+ local x = torch.FloatTensor()
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b, n)
+ end
+ checkMultiDevice(x, 'linspace', a, b, n)
+
+ -- Check n = 1 case
+ local x = torch.rand(1)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, a, 1)
+ end
+ checkMultiDevice(x, 'linspace', a, a, 1)
+
+ -- Check default parameter case
+ local x = createTestTensorWithSizes(true, true, {100})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'linspace', a, b)
+ end
+ checkMultiDevice(x, 'linspace', a, b)
+end
+
+function test.logspace()
+ local sz1 = chooseInt(minsize, maxsize)
+ local sz2 = chooseInt(minsize, maxsize)
+ local n = sz1 * sz2
+ local a = torch.uniform()
+ local b = torch.uniform()
+ local x = torch.FloatTensor():rand(sz1, sz2)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b, n)
+ end
+ checkMultiDevice(x, 'logspace', a, b, n)
+
+ -- Check range for non-contiguous tensors.
+ local x = createTestTensorWithSizes(true, true, {sz1, sz2})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b, n)
+ end
+ checkMultiDevice(x, 'logspace', a, b, n)
+
+ -- Check new tensor creation
+ local x = torch.FloatTensor()
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b, n)
+ end
+ checkMultiDevice(x, 'logspace', a, b, n)
+
+ -- Check n = 1 case
+ local x = torch.rand(1)
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, a, 1)
+ end
+ checkMultiDevice(x, 'logspace', a, a, 1)
+
+ -- Check default parameter case
+ local x = createTestTensorWithSizes(true, true, {100})
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'logspace', a, b)
+ end
+ checkMultiDevice(x, 'logspace', a, b)
+end
+
function test.add()
local sz1 = chooseInt(minsize, maxsize)
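For context on what the new linspace/logspace tests above compare against, the CPU semantics are: torch.linspace(a, b, n) produces n evenly spaced points from a to b, and torch.logspace(a, b, n) produces n points spaced evenly on a log scale from 10^a to 10^b; both default to n = 100, which is why the default-parameter case uses a 100-element tensor. A small illustrative sketch:

    local lin = torch.linspace(0, 1, 5)   -- 0.00, 0.25, 0.50, 0.75, 1.00
    local log = torch.logspace(0, 2, 3)   -- 1, 10, 100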
@@ -904,6 +1003,35 @@ function test.add()
checkMultiDevice(x, 'add', y, v, z)
end
+local test_bitops = function(funcname, tmin, tmax, vmin, vmax)
+ local sz1 = chooseInt(minsize, maxsize)
+ local sz2 = chooseInt(minsize, maxsize)
+ local x = torch.IntTensor(sz1, sz2):random(tmin, tmax)
+ local v = torch.random(vmin, vmax)
+ compareCPUAndCUDATypeTensorArgs('torch.CudaIntTensor', nil, x, funcname, v)
+ checkMultiDevice(x, funcname, v)
+end
+
+function test.lshift()
+ test_bitops('lshift', 1, 1000, 1, 10)
+end
+
+function test.rshift()
+ test_bitops('rshift', 1000, 1000000, 1, 10)
+end
+
+function test.bitand()
+ test_bitops('bitand', 1, 1000, 1, 255)
+end
+
+function test.bitor()
+ test_bitops('bitor', 1, 1000, 1, 255)
+end
+
+function test.bitxor()
+ test_bitops('bitxor', 1, 1000, 1, 255)
+end
+
function test.csub()
local sz1 = chooseInt(minsize, maxsize)
local sz2 = chooseInt(minsize, maxsize)
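The new bitwise tests above assume element-wise integer semantics on the CUDA side that match CPU torch; a rough sketch of what they exercise, assuming the usual in-place x:op(value) convention:

    local x = torch.IntTensor({1, 2, 3})
    x:lshift(2)    -- left shift: x becomes {4, 8, 12}, i.e. each element times 2^2
    x:bitand(6)    -- bitwise AND with 6: x becomes {4, 0, 4}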
@@ -1481,6 +1609,60 @@ function test.diag()
checkMultiDevice(y1, 'diag', k)
end
+function test.range()
+ local xmin = chooseInt(minsize, maxsize)
+ local xmax = chooseInt(xmin, maxsize)
+ local step = 3
+ local size = math.floor((xmax - xmin) / step + 1)
+ -- Base case
+ local x = torch.FloatTensor():rand(size)
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check range for non-contiguous tensors.
+ local x = createTestTensorWithSizes(true, true, {size})
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check new tensor creation
+ local x = torch.Tensor()
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check negative step case
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmax, xmin, -step)
+ end
+ checkMultiDevice(x, 'range', xmax, xmin, -step)
+
+ -- Check default parameter case
+ local x = createTestTensorWithSizes(true, true, {size})
+ for k, typename in ipairs(typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+
+ -- Check floating step case
+ local step = 1.3
+ local x = torch.Tensor()
+ for k, typename in ipairs(float_typenames) do
+ local x = x:type(t2cpu[typename])
+ compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'range', xmin, xmax, step)
+ end
+ checkMultiDevice(x, 'range', xmin, xmax, step)
+end
+
function test.trace()
local sz1 = chooseInt(minsize, maxsize)
local sz2 = chooseInt(minsize, maxsize)
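For reference, the range semantics the tests above rely on: torch.range(xmin, xmax, step) fills a tensor with the sequence xmin, xmin+step, ... without exceeding xmax (the size is floor((xmax - xmin)/step + 1), as computed above); step defaults to 1 and may be negative. A short sketch:

    local r1 = torch.range(2, 10, 3)    -- 2, 5, 8
    local r2 = torch.range(5, 1, -2)    -- 5, 3, 1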
@@ -1877,10 +2059,10 @@ local function testIndexAdd(types, gpu2cpu_map)
for k, typename in ipairs(types) do
local ctype = t2cpu[typename]
local x, src = x:type(ctype), src:type(ctype)
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, true, nil, x, 'indexAdd',
index, longIndex, src)
if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, false, nil, x, 'indexAdd',
index, longIndex, src)
end
end
@@ -1892,10 +2074,10 @@ local function testIndexAdd(types, gpu2cpu_map)
for k, typename in ipairs(types) do
local ctype = t2cpu[typename]
local x, src = x:type(ctype), src:type(ctype)
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, true, nil, x, 'indexAdd',
index, longIndex, src)
if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, false, nil, x, 'indexAdd',
index, longIndex, src)
end
end
@@ -1908,10 +2090,10 @@ local function testIndexAdd(types, gpu2cpu_map)
for k, typename in ipairs(types) do
local ctype = t2cpu[typename]
local x, src = x:type(ctype), src:type(ctype)
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, true, nil, x, 'indexAdd',
index, longIndex, src)
if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
- compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+ compareCPUAndCUDATypeTensorArgsWithConvInternal(typename, gpu2cpu_map, false, nil, x, 'indexAdd',
index, longIndex, src)
end
end
@@ -3593,43 +3775,482 @@ function test.sort()
tester:assert(isEqual(gather_cpu, gather_gpu), 'indices mismatch')
end
+local function explore(typename, func, t, topk, indices)
+ if t:nDimension() == 1 then
+ func(typename, t, topk, indices)
+ else
+ for i = 1, t:size(1) do
+ explore(typename, func, t[i], topk[i], indices[i])
+ end
+ end
+end
+
function test.topk()
- local function runTopK(t, dim, k, dir)
- -- FIXME: if the tensors ever contain equivalent values, then their indices
- -- could in fact be different.
+ -- need to ensure unique values for index checking, so for the first pass we create Tensors
+ -- with sizes less than the maximum range of values for that type
+ local counts = {}
+ counts['torch.CudaByteTensor'] = 255
+ counts['torch.CudaCharTensor'] = 255
+ counts['torch.CudaShortTensor'] = 65536
+ counts['torch.CudaIntTensor'] = 2 ^ 20
+ counts['torch.CudaTensor'] = 2 ^ 20
+ counts['torch.CudaLongTensor'] = 2 ^ 20
+ counts['torch.CudaDoubleTensor'] = 2 ^ 20
+ counts['torch.CudaHalfTensor'] = 32768
- if torch.Tensor.type(t) == 'torch.CudaTensor' then
- return t:topk(k, dim, dir, true)
- else
- local sorted, indices = t:sort(dim, dir)
- return sorted:narrow(dim, 1, k), indices:narrow(dim, 1, k)
+ for _, typename in ipairs(typenames) do
+ for tries = 1, 5 do
+ local t = createTestTensor(counts[typename]):type(typename)
+ local dim = chooseInt(1, t:nDimension())
+ local dimSize = t:size(dim)
+ local dir = chooseInt(1, 2) == 1
+
+ -- Test boundary conditions
+ local kTests = {1, dimSize}
+
+ -- and some other random ones
+ table.insert(kTests, chooseInt(1, dimSize))
+ for i = 1, 2 do
+ -- some sizes that fit in our inplace kernel range (the dimSize one
+ -- will fall back to Thrust)
+ table.insert(kTests, chooseInt(1, math.min(2048, dimSize)))
+ end
+
+ for k = 1, #kTests do
+ compareCPUAndCUDATypeTensorArgsWithLimit(typename, nil, 1, t, 'topk', kTests[k], dim, dir, true)
+
+ -- verify that indices picked yield topk value in original tensor
+ local topk, indices = t:topk(kTests[k], dim, dir, true)
+ local verify = function(typename, t, topk, indices)
+ t = t:type(t2cpu[typename])
+ indices = indices:long()
+ topk = topk:type(t2cpu[typename])
+ for i = 1, indices:size(1) do
+ tester:assert(t[indices[i]] == topk[i])
+ end
+ end
+
+ local tt = t:transpose(dim, t:nDimension())
+ local ttk = topk:transpose(dim, topk:nDimension())
+ local tti = indices:transpose(dim, indices:nDimension())
+
+ explore(typename, verify, tt, ttk, tti)
+ end
end
end
+end
- for tries = 1, 5 do
- -- max size 2^20 for indexing
- local t = createTestTensor(2 ^ 20)
- local dim = chooseInt(1, t:nDimension())
- local dimSize = t:size(dim)
- local dir = chooseInt(1, 2) == 1
+local function verifyMode1D(tensor)
+ -- We cannot rely on comparing against CPU-Torch, since the way it resolves
+ -- ties between equal modes and picks the corresponding index is not
+ -- reliable. Instead we count occurrences ourselves with :apply and verify
+ -- the GPU results against those counts
+
+ -- counts is a table of tensor element -> # of occurrences
+ local counts = {}
+
+ -- populate counts by iterating over the elements in the tensor
+ tensor:apply(function(x) if counts[x] == nil then counts[x] = 1 else counts[x] = counts[x] + 1 end return x end)
+
+ -- next, calculate the max occurrence in the tensor
+ local max = -1;
+ for _, count in pairs(counts) do
+ if count > max then max = count end
+ end
+
+ -- now verify for all the GPU types that 1. the mode picked has max occurrences,
+ -- and 2. that the index returned contains that mode
+
+ for _, cudaType in ipairs(typenames) do
+ local baseType = t2cpu[cudaType]
+ assert(baseType, 'Cannot find baseType for ' .. cudaType)
+ local x_cpu = tensor:clone():type(baseType)
+ local x_cuda = cloneExactlyToGPUType(x_cpu, nil, t2gpu)
+
+ local modes, indices = x_cuda:mode()
+
+ -- 1D, so should only be a single return
+ tester:assert(modes:nElement() == 1, 'mode returned an invalid number of values')
+ tester:assert(indices:nElement() == 1, 'mode returned an invalid number of indices')
+ local mode = modes[1]
+ local index = indices[1]
+
+ tester:assert(counts[mode] == max, string.format(
+ 'Type: %s --> Selected mode of %s which has count of %s, but mode must have %s occurrences',
+ cudaType, tostring(mode), tostring(counts[mode]), tostring(max)
+ ))
+ tester:assert(tensor[index] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[index]), tostring(mode)
+ ))
+ end
+end
- -- Test boundary conditions
- local kTests = {1, dimSize}
+local function assertSize(tensor, sizes)
+ local valid = true
+ if tensor:nDimension() ~= #sizes then
+ tester:assert(false, 'tensor dimension mismatch')
+ end
+ for i, size in ipairs(sizes) do
+ if tensor:size(i) ~= size then
+ valid = false
+ end
+ end
+ tester:assert(valid, 'tensor size mismatch')
+end
+
+local function verifyMode2D(tensor, onlyDim)
+ local dims = {}
+ if onlyDim ~= nil then
+ dims = {onlyDim}
+ else
+ dims = {1, 2}
+ end
+
+ for _, dim in ipairs(dims) do
+ -- In the case of a 2D Tensor, we need to calculate the count for each slice
+ -- sCounts is a table containing the counts of elements for each slice,
+ -- sMax is a table containing the max occurrence for each slice
+ local sCounts = {}
+ local sMax = {}
+
+ -- First, we use the :split() function to split the Tensor
+ -- Suppose we are mode'ing a 5x10 Tensor. If we mode along dim=1,
+ -- we have a result Tensor that is 1x10, so we need the counts for
+ -- all 10 slices of size=5. So we actually split along dim=2, with
+ -- size = 1, to yield 10 5x1 tensors
+ local splits = tensor:split(1, dim == 1 and 2 or 1)
+
+ -- next, we iterate over these split Tensors to calculate the mode, as we
+ -- did in the 1D case
+ for i, slice in pairs(splits) do
+ local counts = {}
+ slice:apply(function(x) if counts[x] == nil then counts[x] = 1 else counts[x] = counts[x] + 1 end return x end)
+
+ local max = -1;
+ for _, count in pairs(counts) do
+ if count > max then max = count end
+ end
- -- and some other random ones
- table.insert(kTests, chooseInt(1, dimSize))
- for i = 1, 2 do
- -- some sizes that fit in our inplace kernel range (the dimSize one
- -- will fall back to Thrust)
- table.insert(kTests, chooseInt(1, math.min(2048, dimSize)))
+ sCounts[i] = counts;
+ sMax[i] = max;
end
- for k = 1, #kTests do
- compareFloatAndCuda(t, runTopK, dim, kTests[k], dir)
+ -- verification pass
+ for _, cudaType in ipairs(typenames) do
+ local baseType = t2cpu[cudaType]
+ assert(baseType, 'Cannot find baseType for ' .. cudaType)
+ local x_cpu = tensor:clone():type(baseType)
+ local x_cuda = cloneExactlyToGPUType(x_cpu, nil, t2gpu)
+ local modes, indices = x_cuda:mode(dim)
+
+ -- 2D, so expect:
+ -- (dim = 1) a 1xsize(tensor, dim = 2) tensor
+ -- (dim = 2) a size(tensor, dim = 1)x1 tensor
+
+ if dim == 1 then
+ assertSize(modes, {1, tensor:size(2)})
+ assertSize(indices, {1, tensor:size(2)})
+ else
+ assertSize(modes, {tensor:size(1), 1})
+ assertSize(indices, {tensor:size(1), 1})
+ end
+
+ -- we need to run through and verify that all of the modes/indices are valid, for
+ -- the results of each slice. First, we squeeze the Tensor, so we can iterate over
+ -- both the 1D/2D values in the same manner
+ modes = modes:squeeze(dim)
+ indices = indices:squeeze(dim)
+
+ -- iterate over each slice, and verify that for each slice the mode selected has
+ -- max occurrences, and the index points to the mode
+ for i, counts in pairs(sCounts) do
+ local max = sMax[i]
+ local mode = modes[i]
+ local index = indices[i]
+
+ tester:assert(counts[mode] == max, string.format(
+ 'Type: %s --> Selected mode of %s which has count of %s, but mode must have %s occurrences',
+ cudaType, tostring(mode), tostring(counts[mode]), tostring(max)
+ ))
+
+ if dim == 1 then
+ tester:assert(tensor[index][i] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[index][i]), tostring(mode)
+ ))
+ else
+ tester:assert(tensor[i][index] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[i][index]), tostring(mode)
+ ))
+ end
+ end
end
end
end
+local function verifyMode3D(tensor, onlyDim)
+ local dims = {}
+ if onlyDim ~= nil then
+ dims = {onlyDim}
+ else
+ dims = {1, 2, 3}
+ end
+ -- In the case of 3D Tensor, we need to calculate the count for each slice,
+ -- but this time, we have two layers of depth, for each of the non-mode dims
+ -- so sCounts is a multi-level table where sCounts[i][j] is the counts for
+ -- (_, i, j), (i, _, j) or (i, j, _) depending on the dim
+ local sCounts = {}
+ local sMax = {}
+
+ -- Suppose we have a 2x3x4 Tensor T:
+ -- (1, .., ..), (2, .., ..)
+ -- [1, 2, 3, 4] [3, 2, 2, 4]
+ -- [5, 6, 7, 8] [5, 6, 8, 7]
+ -- [9, 10, 11, 12] [1, 10, 11, 1]
+ --
+ -- Then for dim = 1, we need counts to be a multi-level table (3x4xcounts)
+ -- 2 (2x4xcounts)
+ -- 3 (2x3xcounts)
+ --
+ -- Results: dim = 1
+ -- {1:
+ -- {1:
+ -- 1 --> 1,
+ -- 3 --> 1,
+ -- 2:
+ -- 2 --> 2,
+ -- 3:
+ -- 2 --> 1,
+ -- 3 --> 1,
+ -- 4:
+ -- 4 --> 2,
+ -- },
+ -- {2:
+ -- {1:
+ -- 5 --> 2,
+ -- ...
+
+ -- used to set loop bounds and indexing to construct the above table using the loop below
+ local dbounds = {
+ {tensor:size(2), tensor:size(3), tensor:size(1)},
+ {tensor:size(1), tensor:size(3), tensor:size(2)},
+ {tensor:size(1), tensor:size(2), tensor:size(3)}}
+ local dfuncs = {
+ function(tensor, i, j, k) return tensor[k][i][j] end,
+ function(tensor, i, j, k) return tensor[i][k][j] end,
+ function(tensor, i, j, k) return tensor[i][j][k] end,
+ }
+
+ -- loop...
+ for d, bounds in ipairs(dbounds) do
+ sCounts[d] = {}
+ sMax[d] = {}
+ for i = 1, bounds[1] do
+ sCounts[d][i] = {}
+ sMax[d][i] = {}
+ for j = 1, bounds[2] do
+ sCounts[d][i][j] = {}
+ sMax[d][i][j] = {}
+ for k = 1, bounds[3] do
+ local v = dfuncs[d](tensor, i, j, k)
+ if sCounts[d][i][j][v] == nil then
+ sCounts[d][i][j][v] = 1
+ else
+ sCounts[d][i][j][v] = sCounts[d][i][j][v] + 1
+ end
+
+ local max = -1
+ for _, count in pairs(sCounts[d][i][j]) do
+ if count > max then max = count end
+ end
+ sMax[d][i][j] = max
+ end -- k
+ end -- j
+ end -- i
+ end -- d
+
+
+ -- verification pass
+ for _, dim in ipairs(dims) do
+ for _, cudaType in ipairs(typenames) do
+ local baseType = t2cpu[cudaType]
+ assert(baseType, 'Cannot find baseType for ' .. cudaType)
+ local x_cpu = tensor:clone():type(baseType)
+ local x_cuda = cloneExactlyToGPUType(x_cpu, nil, t2gpu)
+ local modes, indices = x_cuda:mode(dim)
+
+ if dim == 1 then
+ assertSize(modes, {1, tensor:size(2), tensor:size(3)})
+ assertSize(indices, {1, tensor:size(2), tensor:size(3)})
+ elseif dim == 2 then
+ assertSize(modes, {tensor:size(1), 1, tensor:size(3)})
+ assertSize(indices, {tensor:size(1), 1, tensor:size(3)})
+ else
+ assertSize(modes, {tensor:size(1), tensor:size(2), 1})
+ assertSize(indices, {tensor:size(1), tensor:size(2), 1})
+ end
+
+ -- squeeze on mode dim
+ modes = modes:squeeze(dim)
+ indices = indices:squeeze(dim)
+
+ -- iterate over slices
+ for i, js in pairs(sCounts[dim]) do
+ for j, counts in pairs(js) do
+ local max = sMax[dim][i][j]
+ local mode = modes[i][j]
+ local index = indices[i][j]
+
+ tester:assert(counts[mode] == max, string.format(
+ 'Type: %s --> Selected mode of %s which has count of %s, but mode must have %s occurrences',
+ cudaType, tostring(mode), tostring(counts[mode]), tostring(max)
+ ))
+
+ if dim == 1 then
+ tester:assert(tensor[index][i][j] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[index][i][j]), tostring(mode)
+ ))
+ elseif dim == 2 then
+ tester:assert(tensor[i][index][j] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[i][index][j]), tostring(mode)
+ ))
+ else
+ tester:assert(tensor[i][j][index] == mode, string.format(
+ 'Type: %s --> Selected index of %s which has value %s, but mode is %s',
+ cudaType, tostring(index), tostring(tensor[i][j][index]), tostring(mode)
+ ))
+ end
+
+ end -- j
+ end --i
+ end -- tensor type
+ end -- dim
+end
+
+function test.mode()
+ -- Tests for 1D Tensors
+
+ -- Single-element Tensor
+ local input = torch.FloatTensor({1})
+ verifyMode1D(input)
+
+ -- Tensor of all the same values
+ local input = torch.FloatTensor(10):fill(1)
+ verifyMode1D(input)
+
+ -- Tensor with a unique range of values
+ local input = torch.FloatTensor({4, 3, 6, 8, 2, 1})
+ verifyMode1D(input)
+
+ -- Handles ties when there are two things with equal counts
+ local input = torch.FloatTensor({2, 2, 1, 1})
+ verifyMode1D(input)
+
+ -- Big Range of Values: (4 is the mode)
+ local input = torch.FloatTensor({
+ 1, 4, 4, 4, 4, 1, 1, 2, 2, 2, 3, 4, 5, 5, 4, 4, 4, 4, 4, 4,
+ 2, 2, 1, 1, 2, 3, 4, 4, 4, 4, 2, 3, 4, 4, 3, 2, 1, 2, 3, 4})
+ verifyMode1D(input)
+
+ -- Larger Example
+ local input = torch.FloatTensor(1000):apply(function(x) return torch.random(1, 10) end)
+ verifyMode1D(input)
+
+ -- verify input is unchanged
+ local input = torch.FloatTensor({4, 3, 6, 8, 2, 1})
+ local same = torch.FloatTensor({4, 3, 6, 8, 2, 1})
+ torch.mode(input)
+ tester:assertTensorEq(input, same, 0, 'cutorch mode modified input')
+
+ -- Tests for 2D Tensors
+
+ -- Tensor of all the same values
+ local input = torch.FloatTensor(3, 4):fill(1)
+ verifyMode2D(input)
+
+ -- Tensor with a unique range of values
+ local input = torch.FloatTensor({{2, 3, 5, 7},
+ {1, 10, 17, 6},
+ {0, 22, 14, 9}})
+ verifyMode2D(input)
+
+ -- Consistency between ties when there are two things with equal counts
+ local input = torch.FloatTensor({{2, 2, 3, 3},
+ {1, 1, 3, 3},
+ {2, 2, 1, 1},
+ {1, 1, 1, 1}})
+ verifyMode2D(input)
+
+ -- Larger example
+ local input = torch.FloatTensor(50, 100):apply(function(x) return torch.random(1, 10) end)
+ verifyMode2D(input)
+
+ -- Tests for 3D Tensors
+
+ -- Tensor of all the same values
+ local input = torch.FloatTensor(2, 4, 5):fill(1)
+ verifyMode3D(input)
+
+ -- Tensor with a unique range of values
+ local input = torch.FloatTensor(
+ {
+ {{2, 3, 5, 7},
+ {1, 10, 17, 6},
+ {0, 22, 14, 9}},
+
+ {{32, 88, 25, 4},
+ {21, 78, 57, 111},
+ {15, 68, 64, 117}}
+ }
+ )
+ verifyMode3D(input)
+
+ -- Handles ties when there are two things with equal counts
+ local input = torch.FloatTensor(
+ {
+ {{2, 2, 3, 3},
+ {1, 1, 3, 3},
+ {2, 2, 1, 1},
+ {1, 1, 1, 1}},
+
+ {{3, 3, 4, 4},
+ {2, 2, 4, 4},
+ {3, 3, 2, 2},
+ {2, 2, 2, 2}},
+ }
+ )
+ verifyMode3D(input)
+
+ -- Larger example
+ local input = torch.FloatTensor(14, 22, 32):apply(function(x) return torch.random(1, 10) end)
+ verifyMode3D(input)
+end
+
+function test.bigmode()
+ -- Examples that overflow the fused-kernel path
+ local input = torch.IntTensor(16384):apply(function(x) return torch.random(1, 100) end)
+ verifyMode1D(input)
+
+ local input = torch.FloatTensor(4096, 4):fill(1)
+ verifyMode2D(input, 1)
+
+ local input = torch.FloatTensor(4, 4096):fill(1)
+ verifyMode2D(input, 2)
+
+ local input = torch.FloatTensor(2, 2, 4096):fill(1)
+ verifyMode3D(input, 3)
+
+ local input = torch.FloatTensor(2, 4096, 2):fill(1)
+ verifyMode3D(input, 2)
+
+ local input = torch.FloatTensor(4096, 2, 2):fill(1)
+ verifyMode3D(input, 1)
+end
+
function test.cat()
for k, typename in ipairs(typenames) do
for dim = 1, 3 do
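The verifyMode* helpers above all follow the same pattern: build reference counts on the CPU with :apply, then check that the value cutorch returns as the mode has the maximal count and that the returned index points at an occurrence of it. A condensed sketch of that check for a single 1D slice, illustrative only and assuming cutorch is loaded:

    local slice = torch.FloatTensor({2, 2, 1, 1, 3})
    local counts = {}
    slice:apply(function(v) counts[v] = (counts[v] or 0) + 1 return v end)
    local best = -1
    for _, c in pairs(counts) do if c > best then best = c end end
    local modes, indices = slice:cuda():mode()     -- 1-element tensors for 1D input
    assert(counts[modes[1]] == best)               -- mode has maximal count
    assert(slice[indices[1]] == modes[1])          -- index points at the mode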
@@ -3661,6 +4282,32 @@ function test.cat()
end
end
+function test.catNoDim()
+ for k, typename in ipairs(typenames) do
+ local a
+ local b
+ local c
+
+ a = torch.Tensor(minsize):uniform():type(typename)
+ b = torch.Tensor(minsize):uniform():type(typename)
+ c = torch.cat(a, b)
+ tester:assertTensorEq(c:narrow(1, 1, minsize), a, 0, 'torch.cat value')
+ tester:assertTensorEq(c:narrow(1, minsize + 1, minsize), b, 0, 'torch.cat value')
+
+ a = torch.Tensor(1, minsize):uniform():type(typename)
+ b = torch.Tensor(1, minsize):uniform():type(typename)
+ c = torch.cat(a, b)
+ tester:assertTensorEq(c:narrow(2, 1, minsize), a, 0, 'torch.cat value')
+ tester:assertTensorEq(c:narrow(2, minsize + 1, minsize), b, 0, 'torch.cat value')
+
+ a = torch.Tensor(10, minsize):uniform():type(typename)
+ b = torch.Tensor(10, minsize):uniform():type(typename)
+ c = torch.cat(a, b)
+ tester:assertTensorEq(c:narrow(2, 1, minsize), a, 0, 'torch.cat value')
+ tester:assertTensorEq(c:narrow(2, minsize + 1, minsize), b, 0, 'torch.cat value')
+ end
+end
+
function test.catArray()
for k, typename in ipairs(typenames) do
for dim = 1, 3 do
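test.catNoDim above relies on torch.cat concatenating along the last dimension of its inputs when no dimension is given; a quick sketch of that behaviour:

    local a = torch.CudaTensor(3, 4):uniform()
    local b = torch.CudaTensor(3, 4):uniform()
    local c = torch.cat(a, b)    -- no dim given: concatenates along dim 2, so c is 3x8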
@@ -4157,7 +4804,7 @@ function test.kernelP2PAccess()
end
end
-if os.getenv('THC_CACHING_ALLOCATOR') == '1' then
+if os.getenv('THC_CACHING_ALLOCATOR') ~= '0' then
local function getCyclesPerMs()
cutorch.synchronize()
local t = torch.Timer()
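These allocator tests delay GPU work by a known wall-clock amount via cutorch._sleep(), which, as used here, stalls the current stream for a given number of device cycles; getCyclesPerMs(), continued in the unchanged part of the file, calibrates how many cycles correspond to one millisecond. A rough sketch of that calibration, for orientation only:

    cutorch.synchronize()
    local timer = torch.Timer()
    cutorch._sleep(1e6)                              -- stall for ~1e6 device cycles
    cutorch.synchronize()
    local cyclesPerMs = 1e6 / (timer:time().real * 1000)
    -- later, cutorch._sleep(50 * cyclesPerMs) delays the stream by roughly 50 ms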
@@ -4170,8 +4817,8 @@ if os.getenv('THC_CACHING_ALLOCATOR') == '1' then
local cyclesPerMs = getCyclesPerMs()
-- check that allocations are re-used after deletion
- t = cutorch.createCudaHostTensor({1})
- ptr = t:data()
+ local t = cutorch.createCudaHostTensor({1})
+ local ptr = t:data()
t = nil; collectgarbage()
t = cutorch.createCudaHostTensor({1})
tester:asserteq(t:data(), ptr, 'allocation not reused')
@@ -4184,6 +4831,31 @@ if os.getenv('THC_CACHING_ALLOCATOR') == '1' then
t = cutorch.createCudaHostTensor({1})
tester:assertne(t:data(), ptr, 'allocation re-used too soon')
end
+
+ function test.cachedPinnedMemoryMultiGPU()
+ local device_count = cutorch.getDeviceCount()
+ if device_count < 2 then
+ return
+ end
+
+ local cyclesPerMs = getCyclesPerMs()
+ local t = cutorch.createCudaHostTensor(1)
+ local ptr = t:data()
+ t[1] = 1
+
+ local gpu_tensor1 = torch.CudaTensor({0})
+
+ cutorch.setDevice(2)
+ local gpu_tensor2 = torch.CudaTensor({0})
+ cutorch._sleep(50 * cyclesPerMs) -- delay the copy
+ gpu_tensor2:copyAsync(t)
+
+ cutorch.setDevice(1)
+ t = nil; collectgarbage();
+ t = cutorch.createCudaHostTensor(1)
+ tester:assertne(t:data(), ptr, 'allocation re-used too soon')
+ end
+
end
-- unfortunately, torch.Tester() forgot setUp and tearDown functions.
diff --git a/torch/utils.h b/torch/utils.h
index ae959b7..8d3c455 100644
--- a/torch/utils.h
+++ b/torch/utils.h
@@ -26,7 +26,7 @@
# define TORCH_API TORCH_EXTERNC
#endif
-#if LUA_VERSION_NUM == 501
+#ifndef HAS_LUAL_SETFUNCS
/*
** Adapted from Lua 5.2.0
*/
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-cutorch.git