[lua-torch-torch7] 01/05: New upstream version 0~20170304-g329dff5

Zhou Mo cdluminate-guest at moszumanska.debian.org
Tue Mar 14 08:37:52 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.

commit cb2297ab010e6e3e759b6a6d580866d99be74407
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Tue Mar 14 08:13:25 2017 +0000

    New upstream version 0~20170304-g329dff5
---
 TensorMath.lua                    |  60 +++
 doc/maths.md                      | 213 +++++++++
 generic/Tensor.c                  |   1 -
 lib/TH/CMakeLists.txt             | 220 +++++----
 lib/TH/README.md                  |   7 +
 lib/TH/THGeneral.h.in             |   7 -
 lib/TH/THStorage.c                |  53 +++
 lib/TH/THStorage.h                |   8 +
 lib/TH/THTensor.c                 |   1 +
 lib/TH/THTensor.h                 |   5 -
 lib/TH/THTensorApply.h            | 554 ++++++---------------
 lib/TH/THVector.c                 |   8 +
 lib/TH/cmake/FindMKL.cmake        |   2 +-
 lib/TH/cmake/FindSSE.cmake        |  16 +-
 lib/TH/generic/THTensor.c         | 155 +++---
 lib/TH/generic/THTensor.h         |   8 +-
 lib/TH/generic/THTensorConv.c     |  10 +-
 lib/TH/generic/THTensorCopy.c     |  13 +-
 lib/TH/generic/THTensorMath.c     | 978 +++++++++++++++++++++++++++-----------
 lib/TH/generic/THTensorMath.h     |  10 +
 lib/TH/generic/THVector.h         |  11 +-
 lib/TH/generic/THVectorDefault.c  | 105 ++--
 lib/TH/generic/THVectorDispatch.c | 171 +++++--
 lib/TH/generic/simd/convolve.c    |   6 +-
 lib/TH/generic/simd/simd.h        |  36 +-
 lib/TH/vector/AVX.c               | 274 +++++++++++
 lib/TH/vector/AVX.h               |  23 +
 lib/TH/vector/AVX2.c              |  47 ++
 lib/TH/vector/AVX2.h              |   9 +
 lib/TH/vector/NEON.c              |  81 ++--
 lib/TH/vector/SSE.c               | 259 ++++++----
 test/test.lua                     | 749 ++++++++++++++++++++---------
 32 files changed, 2829 insertions(+), 1271 deletions(-)

diff --git a/TensorMath.lua b/TensorMath.lua
index d816740..53838ae 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -311,6 +311,18 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
          {name=Tensor, method={default=1}},
          {name=real}})
 
+   wrap("lshift",
+        cname("lshift"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
+   wrap("rshift",
+        cname("rshift"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
    wrap("fmod",
         cname("fmod"),
         {{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -323,6 +335,24 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
          {name=Tensor, method={default=1}},
          {name=real}})
 
+   wrap("bitand",
+        cname("bitand"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
+   wrap("bitor",
+        cname("bitor"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
+   wrap("bitxor",
+        cname("bitxor"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
    -- mod alias
    wrap("mod",
         cname("fmod"),
@@ -364,6 +394,18 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
          {name=Tensor, method={default=1}},
          {name=Tensor}})
 
+   wrap("clshift",
+        cname("clshift"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("crshift",
+        cname("crshift"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
    wrap("cfmod",
         cname("cfmod"),
         {{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -376,6 +418,24 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
          {name=Tensor, method={default=1}},
          {name=Tensor}})
 
+   wrap("cbitand",
+        cname("cbitand"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("cbitor",
+        cname("cbitor"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("cbitxor",
+        cname("cbitxor"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
    -- cmod alias
    wrap("cmod",
         cname("cfmod"),
diff --git a/doc/maths.md b/doc/maths.md
index b4f1592..eb9f5cf 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -860,6 +860,87 @@ The number of elements must match, but sizes do not matter.
 
 `z:cdiv(x, y)` puts the result in `z`.
 
+<a name="torch.lshift"></a>
+### [res] torch.lshift([res,] tensor, value) ###
+<a name="torch.lshift"></a>
+
+Left shift all elements in the `Tensor` by the given `value`.
+
+`z = torch.lshift(x, 2)` will return a new `Tensor` with the result of `x << 2`.
+
+`torch.lshift(z, x, 2)` will put the result of `x << 2` in `z`.
+
+`x:lshift(2)` left shifts all elements of `x` by `2` bits.
+
+`z:lshift(x, 2)` puts the result of `x << 2` in `z`.
+
+Note: For float type tensors, `x:lshift(value)` evaluates `x:mul(math.pow(2, value))` internally.
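+
+For example (an illustrative sketch; the exact REPL output formatting may differ):
+
+```lua
+> x = torch.LongTensor{1, 2, 3}
+> x:lshift(2)
+> x
+  4
+  8
+ 12
+[torch.LongTensor of size 3]
+```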
+
+<a name="torch.clshift"></a>
+### [res] torch.clshift([res,] tensor1, tensor2) ###
+<a name="torch.clshift"></a>
+
+Performs the left shift operation on each element in `tensor1` by the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(2, 2):fill(1)
+> y = torch.LongTensor(2, 2):range(1, 4)
+> x:clshift(y)
+> x
+ 2  4
+ 8 16
+[torch.LongTensor of size 2x2]
+```
+
+`z = torch.clshift(x, y)` returns a new `Tensor`.
+
+`torch.clshift(z, x, y)` puts the result in `z`.
+
+`y:clshift(x)` left shifts all elements of `y` by the corresponding elements of `x`.
+
+`z:clshift(x, y)` puts the result in `z`.
+
+<a name="torch.rshift"></a>
+### [res] torch.rshift([res,] tensor, value) ###
+<a name="torch.rshift"></a>
+
+Right shift all elements in the `Tensor` by the given `value`.
+
+`z = torch.rshift(x, 2)` will return a new `Tensor` with the result of `x >> 2`.
+
+`torch.rshift(z, x, 2)` will put the result of `x >> 2` in `z`.
+
+`x:rshift(2)` right shifts all elements of `x` by `2` bits.
+
+`z:rshift(x, 2)` puts the result of `x >> 2` in `z`.
+
+Note: For float type tensors, `x:rshift(value)` evaluates `x:div(math.pow(2, value))` internally.
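+
+For example (an illustrative sketch; the exact REPL output formatting may differ):
+
+```lua
+> x = torch.LongTensor{16, 8, 4}
+> x:rshift(2)
+> x
+ 4
+ 2
+ 1
+[torch.LongTensor of size 3]
+```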
+
+<a name="torch.crshift"></a>
+### [res] torch.crshift([res,] tensor1, tensor2) ###
+<a name="torch.crshift"></a>
+
+Performs the right shift operation on each element in `tensor1` by the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(2, 2):fill(32)
+> y = torch.LongTensor(2, 2):range(1, 4)
+> x:crshift(y)
+> x
+ 16 8
+  4 2
+[torch.LongTensor of size 2x2]
+```
+
+`z = torch.crshift(x, y)` returns a new `Tensor`.
+
+`torch.crshift(z, x, y)` puts the result in `z`.
+
+`y:crshift(x)` right shifts all elements of `y` by the corresponding elements of `x`.
+
+`z:crshift(x, y)` puts the result in `z`.
 
 <a name="torch.addcdiv"></a>
 ### [res] torch.addcdiv([res,] x [,value], tensor1, tensor2) ###
@@ -1006,6 +1087,138 @@ corresponding elements of `x`.
 
 This function is deprecated and exists only for compatibility with previous versions. Please use `torch.cfmod()` or `torch.cremainder()` instead.
 
+<a name="torch.bitand"></a>
+### [res] torch.bitand([res,] tensor, value) ###
+<a name="torch.bitand"></a>
+
+Performs the bitwise `and` operation on all elements in the `Tensor` with the given `value`.
+
+`z = torch.bitand(x, value)` will return a new `Tensor` with the result of `x & value`.
+
+`torch.bitand(z, x, value)` will put the result of `x & value` in `z`.
+
+`x:bitand(value)` performs bitwise `and` between all elements of `x` and `value`.
+
+`z:bitand(x, value)` puts the result of `x & value` in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
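+
+For example (an illustrative sketch; the exact REPL output formatting may differ):
+
+```lua
+> x = torch.LongTensor{5, 6, 7}
+> x:bitand(6)
+> x
+ 4
+ 6
+ 6
+[torch.LongTensor of size 3]
+```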
+
+<a name="torch.cbitand"></a>
+### [res] torch.cbitand([res,] tensor1, tensor2) ###
+<a name="torch.cbitand"></a>
+
+Performs the bitwise `and` operation between each element in `tensor1` and the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(4):fill(6)
+> y = torch.LongTensor{1, 2, 4, 8}
+> x:cbitand(y)
+> x
+  0
+  2
+  4
+  0
+[torch.LongTensor of size 4]
+```
+`z = torch.cbitand(x, y)` returns a new `Tensor`.
+
+`torch.cbitand(z, x, y)` puts the result in `z`.
+
+`y:cbitand(x)` performs bitwise `and` of all elements of `y` with the corresponding elements of `x`.
+
+`z:cbitand(x, y)` puts the result in `z`.
+
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
+
+<a name="torch.bitor"></a>
+### [res] torch.bitor([res,] tensor, value) ###
+<a name="torch.bitor"></a>
+
+Performs the bitwise `or` operation on all elements in the `Tensor` with the given `value`.
+
+`z = torch.bitor(x, value)` will return a new `Tensor` with the result of `x | value`.
+
+`torch.bitor(z, x, value)` will put the result of `x | value` in `z`.
+
+`x:bitor(value)` performs bitwise `or` between all elements of `x` and `value`.
+
+`z:bitor(x, value)` puts the result of `x | value` in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
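+
+For example (an illustrative sketch; the exact REPL output formatting may differ):
+
+```lua
+> x = torch.LongTensor{1, 2, 4}
+> x:bitor(8)
+> x
+  9
+ 10
+ 12
+[torch.LongTensor of size 3]
+```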
+
+<a name="torch.cbitor"></a>
+### [res] torch.cbitor([res,] tensor1, tensor2) ###
+<a name="torch.cbitor"></a>
+
+Performs the bitwise `or` operation between each element in `tensor1` and the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(4):fill(3)
+> y = torch.LongTensor{1, 2, 4, 8}
+> x:cbitor(y)
+> x
+  3
+  3
+  7
+ 11
+[torch.LongTensor of size 4]
+```
+`z = torch.cbitor(x, y)` returns a new `Tensor`.
+
+`torch.cbitor(z, x, y)` puts the result in `z`.
+
+`y:cbitor(x)` performs bitwise `or` of all elements of `y` with the corresponding elements of `x`.
+
+`z:cbitor(x, y)` puts the result in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
+
+<a name="torch.bitxor"></a>
+### [res] torch.bitxor([res,] tensor, value) ###
+<a name="torch.bitxor"></a>
+
+Performs the bitwise `xor` operation on all elements in the `Tensor` with the given `value`.
+
+`z = torch.bitxor(x, value)` will return a new `Tensor` with the result of `x ^ value`.
+
+`torch.bitxor(z, x, value)` will put the result of `x ^ value` in `z`.
+
+`x:bitxor(value)` performs bitwise `xor` between all elements of `x` and `value`.
+
+`z:bitxor(x, value)` puts the result of `x ^ value` in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
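+
+For example (an illustrative sketch; the exact REPL output formatting may differ):
+
+```lua
+> x = torch.LongTensor{5, 6, 7}
+> x:bitxor(3)
+> x
+ 6
+ 5
+ 4
+[torch.LongTensor of size 3]
+```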
+
+<a name="torch.cbitxor"></a>
+### [res] torch.cbitxor([res,] tensor1, tensor2) ###
+<a name="torch.cbitxor"></a>
+
+Performs the bitwise `xor` operation between each element in `tensor1` and the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(4):fill(15)
+> y = torch.LongTensor{1, 2, 4, 8}
+> x:cbitxor(y)
+> x
+  14
+  13
+  11
+   7
+[torch.LongTensor of size 4]
+```
+`z = torch.cbitxor(x, y)` returns a new `Tensor`.
+
+`torch.cbitxor(z, x, y)` puts the result in `z`.
+
+`y:cbitxor(x)` performs bitwise `xor` of all elements of `y` with the corresponding elements of `x`.
+
+`z:cbitxor(x, y)` puts the result in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
 
 <a name="torch.dot"></a>
 ### [number] torch.dot(tensor1, tensor2) ###
diff --git a/generic/Tensor.c b/generic/Tensor.c
index c2417fe..aabbbdc 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -1355,7 +1355,6 @@ void torch_Tensor_(init)(lua_State *L)
 #ifndef TH_REAL_IS_HALF
   THVector_(vectorDispatchInit)();
 #endif
-
 }
 
 #endif
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 3f66edc..8aeb204 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -20,11 +20,21 @@ IF(NOT TH_INSTALL_BIN_SUBDIR
   SET(TH_INSTALL_CMAKE_SUBDIR "share/cmake/TH" CACHE PATH "TH install cmake subdirectory")
 ENDIF()
 
-# flags
+#######################################################################
+##### flags section
+######################################################################
 
 IF(MSVC)
-  # respect the standard
-  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+  # MSVC supports C99 as of VS2013/VS2015
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99")
+ELSE(MSVC)
+  # enable gnu99 and not c99 because we use
+  # gnu extensions like posix_memalign
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99")
+ENDIF(MSVC)
+
+IF(MSVC)
+  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)  # respect the standard
 ENDIF(MSVC)
 
 IF(UNIX)
@@ -95,70 +105,46 @@ IF(HAVE_GCC_GET_CPUID)
   SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID")
 ENDIF(HAVE_GCC_GET_CPUID)
 
-FIND_PACKAGE(SSE)
+CHECK_C_SOURCE_COMPILES("#include <stdint.h>
+    static inline void cpuid(uint32_t *eax, uint32_t *ebx,
+    			 uint32_t *ecx, uint32_t *edx)
+    {
+      uint32_t a = *eax, b, c = *ecx, d;
+      asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );
+      *eax = a; *ebx = b; *ecx = c; *edx = d;
+    }
+    int main() {
+      uint32_t a,b,c,d;
+      cpuid(&a, &b, &c, &d);
+      return 0;
+    }" NO_GCC_EBX_FPIC_BUG)
+
+IF(NOT NO_GCC_EBX_FPIC_BUG)
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GCC_GET_CPUID")
+ENDIF(NOT NO_GCC_EBX_FPIC_BUG)
+
+
+FIND_PACKAGE(SSE) # checks SSE, AVX and AVX2
 IF(C_SSE2_FOUND)
+  MESSAGE(STATUS "SSE2 Found")
   SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
 ENDIF(C_SSE2_FOUND)
 IF(C_SSE3_FOUND)
+  MESSAGE(STATUS "SSE3 Found")
   SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} -DUSE_SSE3 ${CMAKE_C_FLAGS}")
 ENDIF(C_SSE3_FOUND)
-
-IF(C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
-  SET(simd generic/simd/convolve.c)
-  IF(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve.c PROPERTIES COMPILE_FLAGS "/std:c99")
-  ELSE(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve.c PROPERTIES COMPILE_FLAGS "-std=c99")
-  ENDIF(MSVC)
-ENDIF(C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
-
-IF(C_SSE4_1_FOUND)
-  SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${CMAKE_C_FLAGS}")
-ENDIF(C_SSE4_1_FOUND)
-IF(C_SSE4_2_FOUND)
-  SET(CMAKE_C_FLAGS "${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}")
-ENDIF(C_SSE4_2_FOUND)
-
-IF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
-  IF(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast /std:c99")
-  ELSE(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -std=c99")
-  ENDIF(MSVC)
-  SET(simd ${simd} generic/simd/convolve5x5_sse.c)
-ENDIF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
-
+# we don't set -mavx and -mavx2 flags globally, but only for specific files
+# however, we want to enable the AVX codepaths, so we still need to
+# add USE_AVX and USE_AVX2 macro defines
 IF(C_AVX_FOUND)
+  MESSAGE(STATUS "AVX Found")
   SET(CMAKE_C_FLAGS "-DUSE_AVX ${CMAKE_C_FLAGS}")
-  IF(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast /arch:AVX /std:c99")
-  ELSE(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -mavx -std=c99")
-  ENDIF(MSVC)
-  SET(simd ${simd} generic/simd/convolve5x5_avx.c)
 ENDIF(C_AVX_FOUND)
+IF(C_AVX2_FOUND)
+  MESSAGE(STATUS "AVX2 Found")
+  SET(CMAKE_C_FLAGS "-DUSE_AVX2 ${CMAKE_C_FLAGS}")
+ENDIF(C_AVX2_FOUND)
 
-SET(hdr
-  THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
-  THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
-
-SET(src
-  THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
-  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
-
-SET(src ${src} ${hdr} ${simd})
-ADD_LIBRARY(TH SHARED ${src})
-if(BUILD_STATIC)
-  ADD_LIBRARY(TH_static STATIC ${src})
-endif()
-
-IF(NOT TH_SO_VERSION)
-  SET(TH_SO_VERSION 0)
-ENDIF(NOT TH_SO_VERSION)
-MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}")
-SET_TARGET_PROPERTIES(TH PROPERTIES
-  VERSION   ${TH_SO_VERSION}
-  SOVERSION ${TH_SO_VERSION})
 
 CHECK_C_SOURCE_RUNS("
 #include <stdatomic.h>
@@ -202,6 +188,74 @@ int main()
 " HAS_GCC_ATOMICS)
 ENDIF()
 
+#######################################################################
+##### sources section
+######################################################################
+
+# IF ANY SIMD FOUND
+IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
+  SET(simd generic/simd/convolve.c)
+ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
+
+# IF SSE4 FOUND
+IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
+  SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}")
+  IF(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast")
+  ELSE(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math")
+  ENDIF(MSVC)
+  SET(simd ${simd} generic/simd/convolve5x5_sse.c)
+ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
+
+# IF AVX FOUND
+IF(C_AVX_FOUND)
+  IF(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}")
+    SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX_FLAGS}")
+  ELSE(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}")
+    SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX_FLAGS}")
+  ENDIF(MSVC)
+  SET(simd ${simd} vector/AVX.c generic/simd/convolve5x5_avx.c)
+ENDIF(C_AVX_FOUND)
+
+IF(C_AVX2_FOUND)
+  IF(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX2_FLAGS}")
+  ELSE(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX2_FLAGS}")
+  ENDIF(MSVC)
+  SET(simd ${simd} vector/AVX2.c)
+ENDIF(C_AVX2_FOUND)
+
+SET(hdr
+  THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
+  THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
+
+SET(src
+  THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
+  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
+
+SET(src ${src} ${hdr} ${simd})
+
+#######################################################################
+##### build section
+######################################################################
+
+ADD_LIBRARY(TH SHARED ${src})
+if(BUILD_STATIC)
+  ADD_LIBRARY(TH_static STATIC ${src})
+endif()
+
+IF(NOT TH_SO_VERSION)
+  SET(TH_SO_VERSION 0)
+ENDIF(NOT TH_SO_VERSION)
+MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}")
+SET_TARGET_PROPERTIES(TH PROPERTIES
+  VERSION   ${TH_SO_VERSION}
+  SOVERSION ${TH_SO_VERSION})
+
 IF(HAS_C11_ATOMICS)
   ADD_DEFINITIONS(-DUSE_C11_ATOMICS=1)
   MESSAGE(STATUS "Atomics: using C11 intrinsics")
@@ -233,10 +287,6 @@ IF(LAPACK_FOUND)
   TARGET_LINK_LIBRARIES(TH ${LAPACK_LIBRARIES})
 ENDIF(LAPACK_FOUND)
 
-IF(BLAS_IS_ACCELERATE)
-  MESSAGE(STATUS "BLAS FOUND IS ACCELERATE: Fix for sdot")
-ENDIF()
-
 IF (UNIX AND NOT APPLE)
    INCLUDE(CheckLibraryExists)
    # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830
@@ -253,6 +303,7 @@ IF(UNIX)
   IF(HAVE_MMAP)
     ADD_DEFINITIONS(-DHAVE_MMAP=1)
   ENDIF(HAVE_MMAP)
+  # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html
   ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
   CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
   IF(HAVE_SHM_OPEN)
@@ -268,47 +319,10 @@ IF(UNIX)
   ENDIF(HAVE_MALLOC_USABLE_SIZE)
 ENDIF(UNIX)
 
-
-
 IF(NOT MSVC)
   TARGET_LINK_LIBRARIES(TH m)
 ENDIF(NOT MSVC)
 
-SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-FOREACH(KEYWORD "inline" "__inline__" "__inline")
-  IF(NOT DEFINED C_INLINE)
-
-    SET(CMAKE_REQUIRED_FLAGS "-Dinline=${KEYWORD} ${CMAKE_C_FLAGS}")
-    CHECK_C_SOURCE_RUNS("
-       static inline int static_foo()
-       {
-         return 0;
-       }
-
-       int main(int argc, char *argv[])
-       {
-         static_foo();
-         return 0;
-       }" C_HAS_${KEYWORD})
-
-    IF(C_HAS_${KEYWORD})
-      SET(C_INLINE TRUE)
-# Right now i put it in THGeneral.h -- debatable
-#      ADD_DEFINITIONS("-Dinline=${KEYWORD}")
-      SET(TH_INLINE ${KEYWORD})
-      MESSAGE(STATUS "C inline is supported (${KEYWORD})")
-    ENDIF(C_HAS_${KEYWORD})
-  ENDIF(NOT DEFINED C_INLINE)
-ENDFOREACH(KEYWORD)
-SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-
-IF(NOT DEFINED C_INLINE)
-  MESSAGE(STATUS "C inline seems not supported")
-# Right now i put it in THGeneral.h -- debatable
-#  ADD_DEFINITIONS("-Dinline=")
-SET(TH_INLINE "")
-ENDIF(NOT DEFINED C_INLINE)
-
 # Is __thread supported?
 IF(NOT MSVC)
   CHECK_C_SOURCE_COMPILES("static __thread int x = 1; int main() { return x; }" C_HAS_THREAD)
@@ -324,6 +338,11 @@ ENDIF(NOT C_HAS_THREAD)
 INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
 CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h")
 
+
+#######################################################################
+##### install section
+######################################################################
+
 INSTALL(TARGETS TH
   EXPORT TH-exports
   RUNTIME DESTINATION "${TH_INSTALL_BIN_SUBDIR}"
@@ -358,6 +377,11 @@ INSTALL(FILES
   DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH")
 
 INSTALL(FILES
+  vector/AVX.h
+  vector/AVX2.h
+  DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/vector")
+
+INSTALL(FILES
   generic/THBlas.c
   generic/THBlas.h
   generic/THLapack.c
diff --git a/lib/TH/README.md b/lib/TH/README.md
new file mode 100644
index 0000000..c646ce9
--- /dev/null
+++ b/lib/TH/README.md
@@ -0,0 +1,7 @@
+The following environment variables can be set to disable specific explicit SIMD code paths.
+
+```
+TH_NO_AVX2=1 # disable AVX2 codepaths
+TH_NO_AVX=1  # disable AVX codepaths
+TH_NO_SSE=1  # disable SSE codepaths
+```
\ No newline at end of file
diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
index bc7e448..de11f1b 100644
--- a/lib/TH/THGeneral.h.in
+++ b/lib/TH/THGeneral.h.in
@@ -13,7 +13,6 @@
 
 #cmakedefine USE_BLAS
 #cmakedefine USE_LAPACK
-#cmakedefine BLAS_IS_ACCELERATE
 #cmakedefine BLAS_F2C
 
 #ifdef __cplusplus
@@ -32,12 +31,6 @@
 # define TH_API TH_EXTERNC
 #endif
 
-#define TH_INLINE @TH_INLINE@
-
-#ifndef __cplusplus
-#define inline @TH_INLINE@
-#endif
-
 #ifndef M_PI
 # define M_PI 3.14159265358979323846
 #endif
diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c
index bb63a43..9c48e77 100644
--- a/lib/TH/THStorage.c
+++ b/lib/TH/THStorage.c
@@ -12,3 +12,56 @@
 
 #include "generic/THStorageCopy.c"
 #include "THGenerateHalfType.h"
+
+
+THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) {
+  const int L = TH_DESC_BUFF_LEN;
+  THDescBuff buf;
+  char *str = buf.str;
+  int n = 0;
+  n += snprintf(str, L-n, "[");
+  int i;
+  for(i = 0; i < size->size; i++) {
+    if(n >= L) break;
+    n += snprintf(str+n, L-n, "%ld", size->data[i]);
+    if(i < size->size-1) {
+      n += snprintf(str+n, L-n, " x ");
+    }
+  }
+  if(n < L - 2) {
+    snprintf(str+n, L-n, "]");
+  } else {
+    snprintf(str+L-5, 5, "...]");
+  }
+  return buf;
+}
+
+TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement)
+{
+  ptrdiff_t total_size = (size->size > 0 ? 1 : 0);
+  ptrdiff_t dim_infer = -1;
+  ptrdiff_t i;
+  for (i = 0; i < size->size; i++) {
+    if (size->data[i] == -1) {
+      THArgCheck(dim_infer == -1, 1, "only one dimension can be inferred");
+      dim_infer = i;
+    } else {
+      total_size *= size->data[i];
+    }
+  }
+  if (dim_infer != -1) {
+    THDescBuff buf = THLongStorage_sizeDesc(size);
+    THArgCheck(total_size > 0 && nElement % total_size == 0, 2,
+        "size '%s' is invalid for input of with %td elements", buf.str, nElement);
+  } else {
+    THDescBuff buf = THLongStorage_sizeDesc(size);
+    THArgCheck(nElement == total_size, 2,
+        "size '%s' is invalid for input of with %td elements", buf.str, nElement);
+  }
+  THLongStorage* copy = THLongStorage_newWithSize(size->size);
+  THLongStorage_copy(copy, size);
+  if (dim_infer != -1) {
+    copy->data[dim_infer] = nElement / total_size;
+  }
+  return copy;
+}
diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h
index 9565e10..df80229 100644
--- a/lib/TH/THStorage.h
+++ b/lib/TH/THStorage.h
@@ -7,6 +7,11 @@
 #define THStorage        TH_CONCAT_3(TH,Real,Storage)
 #define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME)
 
+#define TH_DESC_BUFF_LEN 64
+typedef struct {
+    char str[TH_DESC_BUFF_LEN];
+} THDescBuff;
+
 /* fast access methods */
 #define TH_STORAGE_GET(storage, idx) ((storage)->data[(idx)])
 #define TH_STORAGE_SET(storage, idx, value) ((storage)->data[(idx)] = (value))
@@ -23,4 +28,7 @@
 #include "generic/THStorageCopy.h"
 #include "THGenerateHalfType.h"
 
+TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size);
+TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement);
+
 #endif
diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c
index 37071df..115e396 100644
--- a/lib/TH/THTensor.c
+++ b/lib/TH/THTensor.c
@@ -1,6 +1,7 @@
 #include "THAtomic.h"
 #include "THTensor.h"
 #include "THVector.h"
+#include "generic/simd/simd.h"
 
 #include "THBlas.h"
 #include "THLapack.h"
diff --git a/lib/TH/THTensor.h b/lib/TH/THTensor.h
index a155efd..d2a1c57 100644
--- a/lib/TH/THTensor.h
+++ b/lib/TH/THTensor.h
@@ -7,11 +7,6 @@
 #define THTensor          TH_CONCAT_3(TH,Real,Tensor)
 #define THTensor_(NAME)   TH_CONCAT_4(TH,Real,Tensor_,NAME)
 
-#define TH_DESC_BUFF_LEN 64
-typedef struct {
-    char str[TH_DESC_BUFF_LEN];
-} THDescBuff;
-
 /* basics */
 #include "generic/THTensor.h"
 #include "THGenerateAllTypes.h"
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h
index 4fd69d4..3e6ed6e 100644
--- a/lib/TH/THTensorApply.h
+++ b/lib/TH/THTensorApply.h
@@ -1,353 +1,6 @@
 #ifndef TH_TENSOR_APPLY_INC
 #define TH_TENSOR_APPLY_INC
 
-#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
-{ \
-  TYPE1 *TENSOR1##_data = NULL; \
-  long *TENSOR1##_counter = NULL; \
-  long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
-  TYPE2 *TENSOR2##_data = NULL; \
-  long *TENSOR2##_counter = NULL; \
-  long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
-  TYPE3 *TENSOR3##_data = NULL; \
-  long *TENSOR3##_counter = NULL; \
-  long TENSOR3##_stride = 0, TENSOR3##_size = 0, TENSOR3##_dim = 0, TENSOR3##_i, TENSOR3##_n; \
-  int TH_TENSOR_APPLY_hasFinished = 0; \
-\
-  TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \
-  for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
-    TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
-\
-  TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \
-  for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
-    TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
-\
-  TENSOR3##_n = (TENSOR3->nDimension ? 1 : 0); \
-  for(TENSOR3##_i = 0; TENSOR3##_i < TENSOR3->nDimension; TENSOR3##_i++) \
-    TENSOR3##_n *= TENSOR3->size[TENSOR3##_i]; \
-\
-  if(TENSOR1##_n != TENSOR2##_n || TENSOR1##_n != TENSOR3##_n) /* should we do the check in the function instead? i think so */ \
-    THError("inconsistent tensor size"); \
-\
-  if(TENSOR1->nDimension == 0) \
-    TH_TENSOR_APPLY_hasFinished = 1; \
-  else \
-  { \
-    TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
-    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
-    { \
-      if(TENSOR1->size[TENSOR1##_dim] != 1) \
-        break; \
-    } \
-    TENSOR1##_stride = (TENSOR1##_dim == -1 ? 0 : TENSOR1->stride[TENSOR1##_dim]); \
-    TENSOR1##_size = 1; \
-    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
-    { \
-      if(TENSOR1->size[TENSOR1##_dim] != 1) \
-      { \
-        if(TENSOR1->stride[TENSOR1##_dim] == TENSOR1##_size) \
-          TENSOR1##_size *= TENSOR1->size[TENSOR1##_dim]; \
-        else \
-          break; \
-      } \
-    } \
-    TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(TENSOR1##_dim+1)); \
-    for(TENSOR1##_i = 0; TENSOR1##_i <= TENSOR1##_dim; TENSOR1##_i++) \
-      TENSOR1##_counter[TENSOR1##_i] = 0; \
-\
-    TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
-    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
-    { \
-      if(TENSOR2->size[TENSOR2##_dim] != 1) \
-        break; \
-    } \
-    TENSOR2##_stride = (TENSOR2##_dim == -1 ? 0 : TENSOR2->stride[TENSOR2##_dim]); \
-    TENSOR2##_size = 1; \
-    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
-    { \
-      if(TENSOR2->size[TENSOR2##_dim] != 1) \
-      { \
-        if(TENSOR2->stride[TENSOR2##_dim] == TENSOR2##_size) \
-          TENSOR2##_size *= TENSOR2->size[TENSOR2##_dim]; \
-        else \
-          break; \
-      } \
-    } \
-    TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(TENSOR2##_dim+1)); \
-    for(TENSOR2##_i = 0; TENSOR2##_i <= TENSOR2##_dim; TENSOR2##_i++) \
-      TENSOR2##_counter[TENSOR2##_i] = 0; \
-\
-    TENSOR3##_data = TENSOR3->storage->data+TENSOR3->storageOffset; \
-    for(TENSOR3##_dim = TENSOR3->nDimension-1; TENSOR3##_dim >= 0; TENSOR3##_dim--) \
-    { \
-      if(TENSOR3->size[TENSOR3##_dim] != 1) \
-        break; \
-    } \
-    TENSOR3##_stride = (TENSOR3##_dim == -1 ? 0 : TENSOR3->stride[TENSOR3##_dim]); \
-    TENSOR3##_size = 1; \
-    for(TENSOR3##_dim = TENSOR3->nDimension-1; TENSOR3##_dim >= 0; TENSOR3##_dim--) \
-    { \
-      if(TENSOR3->size[TENSOR3##_dim] != 1) \
-      { \
-        if(TENSOR3->stride[TENSOR3##_dim] == TENSOR3##_size) \
-          TENSOR3##_size *= TENSOR3->size[TENSOR3##_dim]; \
-        else \
-          break; \
-      } \
-    } \
-    TENSOR3##_counter = (long*)THAlloc(sizeof(long)*(TENSOR3##_dim+1)); \
-    for(TENSOR3##_i = 0; TENSOR3##_i <= TENSOR3##_dim; TENSOR3##_i++) \
-      TENSOR3##_counter[TENSOR3##_i] = 0; \
-  } \
-\
-  TENSOR1##_i = 0; \
-  TENSOR2##_i = 0; \
-  TENSOR3##_i = 0; \
-  while(!TH_TENSOR_APPLY_hasFinished) \
-  { \
-    for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \
-    { \
-      CODE \
-    } \
-\
-    if(TENSOR1##_i == TENSOR1##_size) \
-    { \
-      if(TENSOR1##_dim == -1) \
-         break; \
-\
-      TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \
-      for(TENSOR1##_i = TENSOR1##_dim; TENSOR1##_i >= 0; TENSOR1##_i--) \
-      { \
-        TENSOR1##_counter[TENSOR1##_i]++; \
-        TENSOR1##_data += TENSOR1->stride[TENSOR1##_i]; \
-\
-        if(TENSOR1##_counter[TENSOR1##_i]  == TENSOR1->size[TENSOR1##_i]) \
-        { \
-          if(TENSOR1##_i == 0) \
-          { \
-            TH_TENSOR_APPLY_hasFinished = 1; \
-            break; \
-          } \
-            else \
-          { \
-            TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1->stride[TENSOR1##_i]; \
-            TENSOR1##_counter[TENSOR1##_i] = 0; \
-          } \
-        } \
-        else \
-          break; \
-      } \
-      TENSOR1##_i = 0; \
-    } \
-\
-    if(TENSOR2##_i == TENSOR2##_size) \
-    { \
-      if(TENSOR2##_dim == -1) \
-         break; \
-\
-      TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
-      for(TENSOR2##_i = TENSOR2##_dim; TENSOR2##_i >= 0; TENSOR2##_i--) \
-      { \
-        TENSOR2##_counter[TENSOR2##_i]++; \
-        TENSOR2##_data += TENSOR2->stride[TENSOR2##_i]; \
-\
-        if(TENSOR2##_counter[TENSOR2##_i]  == TENSOR2->size[TENSOR2##_i]) \
-        { \
-          if(TENSOR2##_i == 0) \
-          { \
-            TH_TENSOR_APPLY_hasFinished = 1; \
-            break; \
-          } \
-            else \
-          { \
-            TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2->stride[TENSOR2##_i]; \
-            TENSOR2##_counter[TENSOR2##_i] = 0; \
-          } \
-        } \
-        else \
-          break; \
-      } \
-      TENSOR2##_i = 0; \
-    } \
-\
-    if(TENSOR3##_i == TENSOR3##_size) \
-    { \
-      if(TENSOR3##_dim == -1) \
-         break; \
-\
-      TENSOR3##_data -= TENSOR3##_size*TENSOR3##_stride; \
-      for(TENSOR3##_i = TENSOR3##_dim; TENSOR3##_i >= 0; TENSOR3##_i--) \
-      { \
-        TENSOR3##_counter[TENSOR3##_i]++; \
-        TENSOR3##_data += TENSOR3->stride[TENSOR3##_i]; \
-\
-        if(TENSOR3##_counter[TENSOR3##_i]  == TENSOR3->size[TENSOR3##_i]) \
-        { \
-          if(TENSOR3##_i == 0) \
-          { \
-            TH_TENSOR_APPLY_hasFinished = 1; \
-            break; \
-          } \
-            else \
-          { \
-            TENSOR3##_data -= TENSOR3##_counter[TENSOR3##_i]*TENSOR3->stride[TENSOR3##_i]; \
-            TENSOR3##_counter[TENSOR3##_i] = 0; \
-          } \
-        } \
-        else \
-          break; \
-      } \
-      TENSOR3##_i = 0; \
-    } \
-  } \
-  THFree(TENSOR1##_counter); \
-  THFree(TENSOR2##_counter); \
-  THFree(TENSOR3##_counter); \
-}
-
-#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
-{ \
-  TYPE1 *TENSOR1##_data = NULL; \
-  long *TENSOR1##_counter = NULL; \
-  long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
-  TYPE2 *TENSOR2##_data = NULL; \
-  long *TENSOR2##_counter = NULL; \
-  long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
-  int TH_TENSOR_APPLY_hasFinished = 0; \
-\
-  TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \
-  for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
-    TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
-\
-  TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \
-  for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
-    TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
-\
-  if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
-    THError("inconsistent tensor size"); \
-\
-  if(TENSOR1->nDimension == 0) \
-    TH_TENSOR_APPLY_hasFinished = 1; \
-  else \
-  { \
-    TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
-    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
-    { \
-      if(TENSOR1->size[TENSOR1##_dim] != 1) \
-        break; \
-    } \
-    TENSOR1##_stride = (TENSOR1##_dim == -1 ? 0 : TENSOR1->stride[TENSOR1##_dim]); \
-    TENSOR1##_size = 1; \
-    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
-    { \
-      if(TENSOR1->size[TENSOR1##_dim] != 1) \
-      { \
-        if(TENSOR1->stride[TENSOR1##_dim] == TENSOR1##_size) \
-          TENSOR1##_size *= TENSOR1->size[TENSOR1##_dim]; \
-        else \
-          break; \
-      } \
-    } \
-    TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(TENSOR1##_dim+1)); \
-    for(TENSOR1##_i = 0; TENSOR1##_i <= TENSOR1##_dim; TENSOR1##_i++) \
-      TENSOR1##_counter[TENSOR1##_i] = 0; \
-\
-    TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
-    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
-    { \
-      if(TENSOR2->size[TENSOR2##_dim] != 1) \
-        break; \
-    } \
-    TENSOR2##_stride = (TENSOR2##_dim == -1 ? 0 : TENSOR2->stride[TENSOR2##_dim]); \
-    TENSOR2##_size = 1; \
-    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
-    { \
-      if(TENSOR2->size[TENSOR2##_dim] != 1) \
-      { \
-        if(TENSOR2->stride[TENSOR2##_dim] == TENSOR2##_size) \
-          TENSOR2##_size *= TENSOR2->size[TENSOR2##_dim]; \
-        else \
-          break; \
-      } \
-    } \
-    TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(TENSOR2##_dim+1)); \
-    for(TENSOR2##_i = 0; TENSOR2##_i <= TENSOR2##_dim; TENSOR2##_i++) \
-      TENSOR2##_counter[TENSOR2##_i] = 0; \
-  } \
-\
-  TENSOR1##_i = 0; \
-  TENSOR2##_i = 0; \
-  while(!TH_TENSOR_APPLY_hasFinished) \
-  { \
-    for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \
-    { \
-      CODE \
-    } \
-\
-    if(TENSOR1##_i == TENSOR1##_size) \
-    { \
-      if(TENSOR1##_dim == -1) \
-         break; \
-\
-      TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \
-      for(TENSOR1##_i = TENSOR1##_dim; TENSOR1##_i >= 0; TENSOR1##_i--) \
-      { \
-        TENSOR1##_counter[TENSOR1##_i]++; \
-        TENSOR1##_data += TENSOR1->stride[TENSOR1##_i]; \
-\
-        if(TENSOR1##_counter[TENSOR1##_i]  == TENSOR1->size[TENSOR1##_i]) \
-        { \
-          if(TENSOR1##_i == 0) \
-          { \
-            TH_TENSOR_APPLY_hasFinished = 1; \
-            break; \
-          } \
-            else \
-          { \
-            TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1->stride[TENSOR1##_i]; \
-            TENSOR1##_counter[TENSOR1##_i] = 0; \
-          } \
-        } \
-        else \
-          break; \
-      } \
-      TENSOR1##_i = 0; \
-    } \
-\
-    if(TENSOR2##_i == TENSOR2##_size) \
-    { \
-      if(TENSOR2##_dim == -1) \
-         break; \
-\
-      TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
-      for(TENSOR2##_i = TENSOR2##_dim; TENSOR2##_i >= 0; TENSOR2##_i--) \
-      { \
-        TENSOR2##_counter[TENSOR2##_i]++; \
-        TENSOR2##_data += TENSOR2->stride[TENSOR2##_i]; \
-\
-        if(TENSOR2##_counter[TENSOR2##_i]  == TENSOR2->size[TENSOR2##_i]) \
-        { \
-          if(TENSOR2##_i == 0) \
-          { \
-            TH_TENSOR_APPLY_hasFinished = 1; \
-            break; \
-          } \
-            else \
-          { \
-            TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2->stride[TENSOR2##_i]; \
-            TENSOR2##_counter[TENSOR2##_i] = 0; \
-          } \
-        } \
-        else \
-          break; \
-      } \
-      TENSOR2##_i = 0; \
-    } \
-  } \
-  THFree(TENSOR1##_counter); \
-  THFree(TENSOR2##_counter); \
-}
-
 /*
  * The basic strategy for apply is as follows:
  *
@@ -370,95 +23,198 @@
  * Tensor. But because we are guaranteed the subsequent data is contiguous in memory, we
  * can simply loop for sizeof(A) iterations and perform the operation, without having to
  * follow the order described by the strides of A.
+ *
+ * 3. As an optimization, we merge dimensions of A that are contiguous in memory. For
+ * example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, then the first two
+ * dimensions can be merged for the purposes of APPLY, reducing the number of nested
+ * loops.
  */
-#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \
-{ \
+
+#define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \
   TYPE *TENSOR##_data = NULL; \
-  long *TENSOR##_counter = NULL; \
-  long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i; \
-  int TH_TENSOR_APPLY_hasFinished = 0; \
+  long *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \
+  long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \
+  int TENSOR##_contiguous = ALLOW_CONTIGUOUS; \
+  TENSOR##_n = (TENSOR->nDimension ? 1 : 0); \
+  for(TENSOR##_i = 0; TENSOR##_i < TENSOR->nDimension; TENSOR##_i++) \
+    TENSOR##_n *= TENSOR->size[TENSOR##_i]; \
 \
   if(TENSOR->nDimension == 0) \
     TH_TENSOR_APPLY_hasFinished = 1; \
   else \
   { \
     TENSOR##_data = TENSOR->storage->data+TENSOR->storageOffset; \
-\
-    /* what is the first stride (ignore first dims=1)? */ \
-    /* it will be used for offset updates while looping through the largest contiguous section */ \
-    for(TENSOR##_dim = TENSOR->nDimension-1; TENSOR##_dim >= 0; TENSOR##_dim--) \
-    { \
-      if(TENSOR->size[TENSOR##_dim] != 1) \
-        break; \
-    } \
-    TENSOR##_stride = (TENSOR##_dim == -1 ? 0 : TENSOR->stride[TENSOR##_dim]); \
-\
-    /* what is the largest contiguous section? size will store the size of this section */ \
     TENSOR##_size = 1; \
-    for(TENSOR##_dim = TENSOR->nDimension-1; TENSOR##_dim >= 0; TENSOR##_dim--) \
-    { \
-      if(TENSOR->size[TENSOR##_dim] != 1) \
-      { \
-        if(TENSOR->stride[TENSOR##_dim] == TENSOR##_size) \
-          TENSOR##_size *= TENSOR->size[TENSOR##_dim]; \
-        else \
+    TENSOR##_stride = 1; \
+    for(TENSOR##_i = TENSOR->nDimension-1; TENSOR##_i >= 0; TENSOR##_i--) { \
+      if(TENSOR->size[TENSOR##_i] != 1) { \
+        if(TENSOR->stride[TENSOR##_i] == TENSOR##_size && TENSOR##_i != DIM) \
+          TENSOR##_size *= TENSOR->size[TENSOR##_i]; \
+        else{ \
+          TENSOR##_contiguous = 0; \
           break; \
+        } \
       } \
     } \
-\
-    /* allocate an array of k+1 elements, where k is the first index that */ \
-    /* break contiguity. Note that if the tensor is contiguous, then k is -1 and */ \
-    /* this counter array is empty. */ \
-\
-    /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \
-    /* storage is given by storage_offset + (i * j), where i is the stride */ \
-    /* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \
-    TENSOR##_counter = (long*)THAlloc(sizeof(long)*(TENSOR##_dim+1)); \
-    for(TENSOR##_i = 0; TENSOR##_i <= TENSOR##_dim; TENSOR##_i++) \
-      TENSOR##_counter[TENSOR##_i] = 0; \
+    if (!TENSOR##_contiguous) { \
+      /* Find the dimension of contiguous sections */ \
+      TENSOR##_dim = 1; \
+      for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; TENSOR##_i--) \
+      { \
+        if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM) \
+          TENSOR##_dim++; \
+      } \
+      /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \
+      TENSOR##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR##_dim)); \
+      TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \
+      TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \
+      TH_TENSOR_dim_index = TENSOR##_dim-1; \
+      TENSOR##_dimOffset = &TENSOR##_counter[DIM]; \
+      TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->nDimension-1]; \
+      TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->nDimension-1]; \
+      /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \
+      /* storage is given by storage_offset + (i * j), where i is the stride */ \
+      /* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \
+      for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \
+        TENSOR##_counter[TENSOR##_i] = 0; \
+      } \
+      for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; --TENSOR##_i) { \
+        if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM) { \
+          TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \
+          if (TENSOR##_i < DIM) \
+            TENSOR##_dimOffset--; \
+        } else { \
+          --TH_TENSOR_dim_index; \
+          TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \
+          TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \
+        } \
+      } \
+      /* Size of the inner most section */ \
+      TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \
+      /* Stride of the inner most section */ \
+      TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \
+    } \
   } \
-\
-  while(!TH_TENSOR_APPLY_hasFinished) \
+  TENSOR##_i = 0;
+
+#define  __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \
+  if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \
   { \
-    /* Loop through the contiguous section of the Tensor */ \
-    for(TENSOR##_i = 0; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \
-    { \
-      CODE \
-    } \
-\
+    if(TENSOR##_contiguous) \
+      break; \
 \
-    /* Handle corner case where the entire Tensor was contiguous */ \
-    if(TENSOR##_dim == -1) \
+    if(TENSOR##_dim == 1) \
        break; \
- \
+\
     /* Reset pointer to beginning of loop */ \
-    TENSOR##_data -= TENSOR##_i*TENSOR##_stride; \
-    for(TENSOR##_i = TENSOR##_dim; TENSOR##_i >= 0; TENSOR##_i--) \
+    TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \
+    for(TENSOR##_i = TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \
     { \
       TENSOR##_counter[TENSOR##_i]++; \
-\
       /* Jump ahread by the stride of this dimension */ \
-      TENSOR##_data += TENSOR->stride[TENSOR##_i]; \
+      TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \
 \
-      if(TENSOR##_counter[TENSOR##_i]  == TENSOR->size[TENSOR##_i]) \
+      if(TENSOR##_counter[TENSOR##_i]  == TENSOR##_sizes[TENSOR##_i]) \
       { \
         if(TENSOR##_i == 0) \
         { \
           TH_TENSOR_APPLY_hasFinished = 1; \
           break; \
         } \
-        else \
+          else \
         { \
           /* Reset the pointer to the beginning of the chunk defined by this dimension */ \
-          TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR->stride[TENSOR##_i]; \
+          TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \
           TENSOR##_counter[TENSOR##_i] = 0; \
         } \
       } \
       else \
         break; \
     } \
+    TENSOR##_i = 0; \
+  } \
+
+#define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \
+{ \
+  int TH_TENSOR_APPLY_hasFinished = 0; \
+  long TH_TENSOR_dim_index = 0; \
+  __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
+  __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
+  __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \
+\
+  if(TENSOR1##_n != TENSOR2##_n || TENSOR1##_n != TENSOR3##_n) /* should we do the check in the function instead? i think so */ \
+    THError("inconsistent tensor size"); \
+\
+  while(!TH_TENSOR_APPLY_hasFinished) \
+  { \
+    /* Loop through the inner most region of the Tensor */ \
+    for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \
+    { \
+      CODE \
+    } \
+    __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
+    __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
+    __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \
+  } \
+  if(TENSOR1##_counter != NULL) \
+    THFree(TENSOR1##_counter); \
+  if(TENSOR2##_counter != NULL) \
+    THFree(TENSOR2##_counter); \
+  if(TENSOR3##_counter != NULL) \
+    THFree(TENSOR3##_counter); \
+}
+
+#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
+  TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE)
+
+#define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \
+{ \
+  int TH_TENSOR_APPLY_hasFinished = 0; \
+  long TH_TENSOR_dim_index = 0; \
+  __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
+  __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
+\
+  if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
+    THError("inconsistent tensor size"); \
+\
+  while(!TH_TENSOR_APPLY_hasFinished) \
+  { \
+    /* Loop through the inner most region of the Tensor */ \
+    for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \
+    { \
+      CODE \
+    } \
+    __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
+    __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
+  } \
+  if(TENSOR1##_counter != NULL) \
+    THFree(TENSOR1##_counter); \
+  if(TENSOR2##_counter != NULL) \
+    THFree(TENSOR2##_counter); \
+}
+
+#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+  TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE)
+
+#define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \
+{ \
+  int TH_TENSOR_APPLY_hasFinished = 0; \
+  long TH_TENSOR_dim_index = 0; \
+  __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \
+\
+  while(!TH_TENSOR_APPLY_hasFinished) \
+  { \
+    /* Loop through the inner most region of the Tensor */ \
+    for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \
+    { \
+      CODE \
+    } \
+    __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \
   } \
   THFree(TENSOR##_counter); \
 }
 
+#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \
+  TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE)
+
 #endif
diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c
index 907adbb..4410578 100644
--- a/lib/TH/THVector.c
+++ b/lib/TH/THVector.c
@@ -15,6 +15,14 @@
 #include "vector/SSE.c"
 #endif
 
+#if defined(USE_AVX)
+#include "vector/AVX.h"
+#endif
+
+#if defined(USE_AVX2)
+#include "vector/AVX2.h"
+#endif
+
 #include "generic/THVectorDefault.c"
 #include "THGenerateAllTypes.h"
 
diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index e68ae6a..7c9325a 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -50,7 +50,7 @@ ENDIF ("${SIZE_OF_VOIDP}" EQUAL 8)
 IF(CMAKE_COMPILER_IS_GNUCC)
   SET(mklthreads "mkl_gnu_thread" "mkl_intel_thread")
   SET(mklifaces  "gf" "intel")
-  SET(mklrtls)
+  SET(mklrtls "iomp5")
 ELSE(CMAKE_COMPILER_IS_GNUCC)
   SET(mklthreads "mkl_intel_thread")
   SET(mklifaces  "intel")
diff --git a/lib/TH/cmake/FindSSE.cmake b/lib/TH/cmake/FindSSE.cmake
index f6aac07..f84ce89 100644
--- a/lib/TH/cmake/FindSSE.cmake
+++ b/lib/TH/cmake/FindSSE.cmake
@@ -62,12 +62,23 @@ SET(AVX_CODE "
 
   int main()
   {
-     __m256 a;
+    __m256 a;
     a = _mm256_set1_ps(0);
     return 0;
   }
 ")
 
+SET(AVX2_CODE "
+  #include <immintrin.h>
+
+  int main()
+  {
+    __m256i a;
+    a = _mm256_abs_epi16(a);
+    return 0;
+  }
+")
+
 MACRO(CHECK_SSE lang type flags)
   SET(__FLAG_I 1)
   SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
@@ -103,9 +114,12 @@ CHECK_SSE(C "SSE3" " ;-msse3;/arch:SSE3")
 CHECK_SSE(C "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
 CHECK_SSE(C "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
 CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX")
+CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2")
 
 CHECK_SSE(CXX "SSE1" " ;-msse;/arch:SSE")
 CHECK_SSE(CXX "SSE2" " ;-msse2;/arch:SSE2")
 CHECK_SSE(CXX "SSE3" " ;-msse3;/arch:SSE3")
 CHECK_SSE(CXX "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
 CHECK_SSE(CXX "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
+CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX")
+CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2")
diff --git a/lib/TH/generic/THTensor.c b/lib/TH/generic/THTensor.c
index 13de6d9..7d88bef 100644
--- a/lib/TH/generic/THTensor.c
+++ b/lib/TH/generic/THTensor.c
@@ -67,8 +67,6 @@ void THTensor_(clearFlag)(THTensor *self, const char flag)
 /**** creation methods ****/
 
 static void THTensor_(rawInit)(THTensor *self);
-static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
-static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride);
 
 
 /* Empty init */
@@ -84,12 +82,12 @@ THTensor *THTensor_(newWithTensor)(THTensor *tensor)
 {
   THTensor *self = THAlloc(sizeof(THTensor));
   THTensor_(rawInit)(self);
-  THTensor_(rawSet)(self,
-                    tensor->storage,
-                    tensor->storageOffset,
-                    tensor->nDimension,
-                    tensor->size,
-                    tensor->stride);
+  THTensor_(setStorageNd)(self,
+                          tensor->storage,
+                          tensor->storageOffset,
+                          tensor->nDimension,
+                          tensor->size,
+                          tensor->stride);
   return self;
 }
 
@@ -104,12 +102,12 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset,
 #ifdef DEBUG
   THAssert((size ? size->size : (stride ? stride->size : 0)) <= INT_MAX);
 #endif
-  THTensor_(rawSet)(self,
-                    storage,
-                    storageOffset,
-                    (size ? size->size : (stride ? stride->size : 0)),
-                    (size ? size->data : NULL),
-                    (stride ? stride->data : NULL));
+  THTensor_(setStorageNd)(self,
+                          storage,
+                          storageOffset,
+                          (size ? size->size : (stride ? stride->size : 0)),
+                          (size ? size->data : NULL),
+                          (stride ? stride->data : NULL));
 
   return self;
 }
@@ -145,7 +143,7 @@ THTensor *THTensor_(newWithStorage4d)(THStorage *storage, ptrdiff_t storageOffse
 
   THTensor *self = THAlloc(sizeof(THTensor));
   THTensor_(rawInit)(self);
-  THTensor_(rawSet)(self, storage, storageOffset, 4, size, stride);
+  THTensor_(setStorageNd)(self, storage, storageOffset, 4, size, stride);
 
   return self;
 }
@@ -176,7 +174,7 @@ THTensor *THTensor_(newWithSize4d)(long size0, long size1, long size2, long size
 
   THTensor *self = THAlloc(sizeof(THTensor));
   THTensor_(rawInit)(self);
-  THTensor_(rawResize)(self, 4, size, NULL);
+  THTensor_(resizeNd)(self, 4, size, NULL);
 
   return self;
 }
@@ -228,6 +226,17 @@ THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, lon
   return self;
 }
 
+THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size)
+{
+  THArgCheck(THTensor_(isContiguous)(tensor), 1, "input is not contiguous");
+  ptrdiff_t numel = THTensor_(nElement)(tensor);
+  THTensor *self = THTensor_(new)();
+  THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel);
+  THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, NULL);
+  THLongStorage_free(inferred_size);
+  return self;
+}
+
 /* Resize */
 void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *stride)
 {
@@ -238,13 +247,13 @@ void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *strid
 #ifdef DEBUG
   THAssert(size->size <= INT_MAX);
 #endif
-  THTensor_(rawResize)(self, size->size, size->data, (stride ? stride->data : NULL));
+  THTensor_(resizeNd)(self, size->size, size->data, (stride ? stride->data : NULL));
 }
 
 void THTensor_(resizeAs)(THTensor *self, THTensor *src)
 {
   if(!THTensor_(isSameSizeAs)(self, src))
-    THTensor_(rawResize)(self, src->nDimension, src->size, NULL);
+    THTensor_(resizeNd)(self, src->nDimension, src->size, NULL);
 }
 
 void THTensor_(resize1d)(THTensor *tensor, long size0)
@@ -266,25 +275,25 @@ void THTensor_(resize4d)(THTensor *self, long size0, long size1, long size2, lon
 {
   long size[4] = {size0, size1, size2, size3};
 
-  THTensor_(rawResize)(self, 4, size, NULL);
+  THTensor_(resizeNd)(self, 4, size, NULL);
 }
 
 void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, long size3, long size4)
 {
     long size[5] = {size0, size1, size2, size3, size4};
 
-  THTensor_(rawResize)(self, 5, size, NULL);
+  THTensor_(resizeNd)(self, 5, size, NULL);
 }
 
 void THTensor_(set)(THTensor *self, THTensor *src)
 {
   if(self != src)
-    THTensor_(rawSet)(self,
-                      src->storage,
-                      src->storageOffset,
-                      src->nDimension,
-                      src->size,
-                      src->stride);
+    THTensor_(setStorageNd)(self,
+                            src->storage,
+                            src->storageOffset,
+                            src->nDimension,
+                            src->size,
+                            src->stride);
 }
 
 void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
@@ -295,12 +304,12 @@ void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storag
 #ifdef DEBUG
   THAssert((size_ ? size_->size : (stride_ ? stride_->size : 0)) <= INT_MAX);
 #endif
-  THTensor_(rawSet)(self,
-                    storage_,
-                    storageOffset_,
-                    (size_ ? size_->size : (stride_ ? stride_->size : 0)),
-                    (size_ ? size_->data : NULL),
-                    (stride_ ? stride_->data : NULL));
+  THTensor_(setStorageNd)(self,
+                          storage_,
+                          storageOffset_,
+                          (size_ ? size_->size : (stride_ ? stride_->size : 0)),
+                          (size_ ? size_->data : NULL),
+                          (stride_ ? stride_->data : NULL));
 }
 
 void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
@@ -346,7 +355,7 @@ void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t stor
   long size[4] = {size0_, size1_, size2_, size3_};
   long stride[4] = {stride0_, stride1_, stride2_, stride3_};
 
-  THTensor_(rawSet)(self, storage_, storageOffset_, 4, size, stride);
+  THTensor_(setStorageNd)(self, storage_, storageOffset_, 4, size, stride);
 }
 
 
@@ -401,7 +410,7 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim
   THTensor_(set)(self, src);
 
   if(dimension1 == dimension2)
-	  return;
+    return;
 
   z = self->stride[dimension1];
   self->stride[dimension1] = self->stride[dimension2];
@@ -510,6 +519,57 @@ void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension)
   }
 }
 
+void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension)
+{
+  int d;
+
+  if(!src)
+    src = self;
+
+  THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 2, "dimension out of range");
+  THArgCheck(src->nDimension > 0, 2, "cannot unsqueeze empty tensor");
+
+  THTensor_(set)(self, src);
+
+  self->size = (long*)THRealloc(self->size, sizeof(long)*(self->nDimension+1));
+  self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1));
+  self->nDimension++;
+  for (d = self->nDimension-1; d > dimension; d--) {
+    self->size[d] = self->size[d-1];
+    self->stride[d] = self->stride[d-1];
+  }
+  if (dimension+1 < self->nDimension) {
+    self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1];
+  } else {
+    self->stride[dimension] = 1;
+  }
+  self->size[dimension] = 1;
+}
+
+int THTensor_(isTransposed)(const THTensor *self)
+{
+  if (THTensor_(isContiguous)(self)) {
+    return 0;
+  }
+  long max_stride = 1;
+  long size_max_stride = 1;
+  long z = 1;
+  int d;
+  for (d = 0; d < self->nDimension; ++d) {
+    if (self->stride[d] == 0 && self->size[d] != 1)
+      return 0;
+    if (self->stride[d] > max_stride) {
+      max_stride = self->stride[d];
+      size_max_stride = self->size[d];
+    }
+    z *= self->size[d];
+  }
+  if (z == max_stride * size_max_stride) {
+    return 1;
+  }
+  return 0;
+}
+
 int THTensor_(isContiguous)(const THTensor *self)
 {
   long z = 1;
@@ -632,7 +692,7 @@ static void THTensor_(rawInit)(THTensor *self)
   self->flag = TH_TENSOR_REFCOUNTED;
 }
 
-static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
+void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
 {
   /* storage */
   if(self->storage != storage)
@@ -655,10 +715,10 @@ static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t stor
   self->storageOffset = storageOffset;
 
   /* size and stride */
-  THTensor_(rawResize)(self, nDimension, size, stride);
+  THTensor_(resizeNd)(self, nDimension, size, stride);
 }
 
-static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride)
+void THTensor_(resizeNd)(THTensor *self, int nDimension, long *size, long *stride)
 {
   int d;
   int nDimension_;
@@ -804,24 +864,9 @@ THDescBuff THTensor_(desc)(const THTensor *tensor) {
 }
 
 THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) {
-  const int L = TH_DESC_BUFF_LEN;
-  THDescBuff buf;
-  char *str = buf.str;
-  int n = 0;
-  n += snprintf(str, L-n, "[");
-  int i;
-  for(i = 0; i < tensor->nDimension; i++) {
-    if(n >= L) break;
-    n += snprintf(str+n, L-n, "%ld", tensor->size[i]);
-    if(i < tensor->nDimension-1) {
-      n += snprintf(str+n, L-n, " x ");
-    }
-  }
-  if(n < L - 2) {
-    snprintf(str+n, L-n, "]");
-  } else {
-    snprintf(str+L-5, 5, "...]");
-  }
+  THLongStorage *size = THTensor_(newSizeOf)((THTensor*)tensor);
+  THDescBuff buf = THLongStorage_sizeDesc(size);
+  THLongStorage_free(size);
   return buf;
 }
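
For reference, the new THTensor_(isTransposed) above decides whether a non-contiguous tensor still covers one dense block of memory. A minimal standalone sketch of that size/stride test, written against plain arrays so it builds without the TH headers (the real function additionally rules out already-contiguous tensors first):

    #include <stdio.h>

    /* Sketch only: the dense-layout test behind THTensor_(isTransposed),
     * on plain size/stride arrays.  The real function also excludes
     * contiguous tensors before applying this check. */
    static int is_dense_layout(const long *size, const long *stride, int ndim)
    {
      long max_stride = 1, size_at_max = 1, numel = 1;
      int d;
      for (d = 0; d < ndim; d++) {
        if (stride[d] == 0 && size[d] != 1)
          return 0;                       /* broadcast dimension: not dense */
        if (stride[d] > max_stride) {
          max_stride = stride[d];
          size_at_max = size[d];
        }
        numel *= size[d];
      }
      return numel == max_stride * size_at_max;
    }

    int main(void)
    {
      long size[2]   = {3, 4};
      long stride[2] = {1, 3};            /* column-major view of a 3x4 tensor */
      printf("%d\n", is_dense_layout(size, stride, 2));   /* prints 1 */
      return 0;
    }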
 
diff --git a/lib/TH/generic/THTensor.h b/lib/TH/generic/THTensor.h
index 81e3cb0..2fac0a8 100644
--- a/lib/TH/generic/THTensor.h
+++ b/lib/TH/generic/THTensor.h
@@ -11,7 +11,7 @@ typedef struct THTensor
     long *size;
     long *stride;
     int nDimension;
-    
+
     THStorage *storage;
     ptrdiff_t storageOffset;
     int refcount;
@@ -68,9 +68,11 @@ TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sli
 TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_);
 TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_);
 TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_);
-  
+TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size);
+
 TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride);
 TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
+TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, long *size, long *stride);
 TH_API void THTensor_(resize1d)(THTensor *tensor, long size0_);
 TH_API void THTensor_(resize2d)(THTensor *tensor, long size0_, long size1_);
 TH_API void THTensor_(resize3d)(THTensor *tensor, long size0_, long size1_, long size2_);
@@ -79,6 +81,7 @@ TH_API void THTensor_(resize5d)(THTensor *tensor, long size0_, long size1_, long
 
 TH_API void THTensor_(set)(THTensor *self, THTensor *src);
 TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, long *size, long *stride);
 TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
                                     long size0_, long stride0_);
 TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
@@ -101,6 +104,7 @@ TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, lon
 
 TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src);
 TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_);
+TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_);
 
 TH_API int THTensor_(isContiguous)(const THTensor *self);
 TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src);
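
The header now also exports THTensor_(newView), which reshapes a contiguous tensor after running the requested sizes through THLongStorage_newInferSize. As a rough sketch of the shape arithmetic usually meant by a single inferred (-1) entry, an illustration only and not the TH implementation:

    #include <stdio.h>

    /* Illustration: replace one -1 entry so the product of sizes equals
     * numel.  Returns 0 on success, -1 if the shape cannot hold numel
     * elements.  Not the TH code, just the arithmetic it relies on. */
    static int infer_size(long *size, int ndim, long numel)
    {
      long known = 1;
      int d, infer_dim = -1;
      for (d = 0; d < ndim; d++) {
        if (size[d] == -1) {
          if (infer_dim >= 0)
            return -1;                    /* at most one -1 is allowed */
          infer_dim = d;
        } else {
          known *= size[d];
        }
      }
      if (infer_dim >= 0) {
        if (known == 0 || numel % known != 0)
          return -1;
        size[infer_dim] = numel / known;
        return 0;
      }
      return (known == numel) ? 0 : -1;
    }

    int main(void)
    {
      long size[2] = {-1, 4};
      if (infer_size(size, 2, 12) == 0)
        printf("view as %ldx%ld\n", size[0], size[1]);   /* 3x4 */
      return 0;
    }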
diff --git a/lib/TH/generic/THTensorConv.c b/lib/TH/generic/THTensorConv.c
index 1e21991..684ff9d 100644
--- a/lib/TH/generic/THTensorConv.c
+++ b/lib/TH/generic/THTensorConv.c
@@ -44,7 +44,7 @@ void THTensor_(validXCorr2Dptr)(real *r_,
       for (ky = 0; ky < kr; ky++) {
         real *pis_ = pi_;
         for (kx = 0; kx < kc; kx++) {
-          THVector_(add)(r_, pis_, alpha*pw_[kx], oc);
+          THVector_(cadd)(r_, r_, pis_, alpha*pw_[kx], oc);
           pis_++;
         }
         pi_ += ic; /* next input line */
@@ -97,7 +97,7 @@ void THTensor_(validConv2Dptr)(real *r_,
       for (ky = 0; ky < kr; ky++) {
         real *pis_ = pi_;
         for (kx = 0; kx < kc; kx++) {
-          THVector_(add)(r_, pis_, alpha*pw_[-kx], oc);
+          THVector_(cadd)(r_, r_, pis_, alpha*pw_[-kx], oc);
           pis_++;
         }
         pi_ += ic; /* next input line */
@@ -149,7 +149,7 @@ void THTensor_(fullConv2Dptr)(real *r_,
       for (ky = 0; ky < kr; ky++) {
         real *pos_ = po_;
         for (kx = 0; kx < kc; kx++) {
-          THVector_(add)(pos_, t_, alpha*pw_[kx], ic);
+          THVector_(cadd)(pos_, pos_, t_, alpha*pw_[kx], ic);
           pos_++;
         }
         po_ += oc; /* next input line */
@@ -202,7 +202,7 @@ void THTensor_(fullXCorr2Dptr)(real *r_,
       for (ky = 0; ky < kr; ky++) {
         real *pos_ = po_;
         for (kx = 0; kx < kc; kx++) {
-          THVector_(add)(pos_, t_, pw_[-kx]*alpha, ic);
+          THVector_(cadd)(pos_, pos_, t_, pw_[-kx]*alpha, ic);
           pos_++;
         }
         po_ += oc; /* next input line */
@@ -255,7 +255,7 @@ void THTensor_(validXCorr2DRevptr)(real *r_,
         real z = *k_++ * alpha;
 
         for(ky = 0; ky < or; ky++) {
-          THVector_(add)(po_, pi_, z, oc);
+          THVector_(cadd)(po_, po_, pi_, z, oc);
           pi_ += ic;
           po_ += oc;
         }
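
The convolution inner loops now go through THVector_(cadd), whose declaration further down in THVector.h takes an explicit destination: z = x + c*y. Passing the destination twice, as the call sites above do, reproduces the old in-place accumulate. A small standalone illustration of that semantics (plain C, not the dispatched SIMD code):

    #include <stddef.h>
    #include <stdio.h>

    /* Sketch of the primitive the loops above now call: z = x + c*y.
     * Passing the same buffer as z and x gives the in-place accumulate. */
    static void vec_cadd(float *z, const float *x, const float *y, float c, ptrdiff_t n)
    {
      ptrdiff_t i;
      for (i = 0; i < n; i++)
        z[i] = x[i] + c * y[i];
    }

    int main(void)
    {
      float acc[3] = {1, 1, 1}, in[3] = {2, 4, 6};
      vec_cadd(acc, acc, in, 0.5f, 3);    /* acc += 0.5 * in */
      printf("%g %g %g\n", acc[0], acc[1], acc[2]);   /* 2 3 4 */
      return 0;
    }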
diff --git a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c
index 3d243e3..e909728 100644
--- a/lib/TH/generic/THTensorCopy.c
+++ b/lib/TH/generic/THTensorCopy.c
@@ -4,7 +4,18 @@
 
 void THTensor_(copy)(THTensor *tensor, THTensor *src)
 {
-  TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
+  if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) {
+    real *sp = THTensor_(data)(src);
+    real *rp = THTensor_(data)(tensor);
+    ptrdiff_t sz = THTensor_(nElement)(tensor);
+#ifndef TH_REAL_IS_HALF
+    THVector_(copy)(rp, sp, sz);
+#else
+    memcpy(rp, sp, sz * sizeof(real));
+#endif
+  } else {
+    TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
+  }
 }
 
 #define IMPLEMENT_THTensor_COPY(TYPENAMESRC, TYPE_SRC) \
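
THTensor_(copy) now takes a flat THVector_(copy)/memcpy path when both tensors are contiguous and hold the same number of elements, falling back to the generic apply macro otherwise. A simplified sketch of that two-path structure, shown here only for a 1-D strided copy:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Sketch of the two-path copy: a flat memcpy when both sides are
     * contiguous, otherwise an element-by-element strided walk. */
    static void copy_1d(double *dst, ptrdiff_t dst_stride,
                        const double *src, ptrdiff_t src_stride, ptrdiff_t n)
    {
      if (dst_stride == 1 && src_stride == 1) {
        memcpy(dst, src, (size_t)n * sizeof(double));   /* contiguous fast path */
      } else {
        ptrdiff_t i;
        for (i = 0; i < n; i++)
          dst[i * dst_stride] = src[i * src_stride];
      }
    }

    int main(void)
    {
      double src[6] = {1, 2, 3, 4, 5, 6}, dst[3];
      copy_1d(dst, 1, src, 2, 3);         /* every other element: 1 3 5 */
      printf("%g %g %g\n", dst[0], dst[1], dst[2]);
      return 0;
    }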
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index d1e7420..b95d81f 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -2,18 +2,121 @@
 #define TH_GENERIC_FILE "generic/THTensorMath.c"
 #else
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
 #define TH_OMP_OVERHEAD_THRESHOLD 100000
 
+#ifdef _OPENMP
+
+#ifndef _WIN32
+#define PRAGMA(P) _Pragma(#P)
+#else
+#define PRAGMA(P) __pragma(P)
+#endif
+
+#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
+{ \
+  ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR); \
+  PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \
+  { \
+    size_t num_threads = omp_get_num_threads(); \
+    size_t tid = omp_get_thread_num(); \
+    ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \
+    ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \
+      TH_TENSOR_offset + TH_TENSOR_size / num_threads; \
+    ptrdiff_t TENSOR##_len = TH_TENSOR_end - TH_TENSOR_offset; \
+    TYPE *TENSOR##_data = THTensor_(data)(TENSOR) + TH_TENSOR_offset; \
+    CODE \
+  } \
+}
+#else
+#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
+{ \
+  TYPE *TENSOR##_data = THTensor_(data)(TENSOR); \
+  ptrdiff_t TENSOR##_len = THTensor_(nElement)(TENSOR); \
+  CODE \
+}
+#endif
+
+#ifdef _OPENMP
+#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+{ \
+  ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
+  PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \
+  { \
+    size_t num_threads = omp_get_num_threads(); \
+    size_t tid = omp_get_thread_num(); \
+    ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \
+    ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \
+      TH_TENSOR_offset + TH_TENSOR_size / num_threads; \
+    ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \
+    TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \
+    TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \
+    CODE \
+  } \
+}
+#else
+#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+{ \
+  TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \
+  TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \
+  ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \
+  CODE \
+}
+#endif
+
+#ifdef _OPENMP
+#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
+{ \
+  ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
+  PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \
+  { \
+    size_t num_threads = omp_get_num_threads(); \
+    size_t tid = omp_get_thread_num(); \
+    ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \
+    ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \
+      TH_TENSOR_offset + TH_TENSOR_size / num_threads; \
+    ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \
+    TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \
+    TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \
+    TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3) + TH_TENSOR_offset; \
+    CODE \
+  } \
+}
+#else
+#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
+{ \
+  TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \
+  TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \
+  TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3); \
+  ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \
+  CODE \
+}
+#endif
+
 void THTensor_(fill)(THTensor *r_, real value)
 {
-  TH_TENSOR_APPLY(real, r_,
-                  THVector_(fill)(r__data, value, r__size); break;);
+  if (THTensor_(isContiguous)(r_) || THTensor_(isTransposed)(r_)) {
+    TH_TENSOR_APPLY_CONTIG(real, r_, THVector_(fill)(r__data, value, r__len););
+  } else {
+    TH_TENSOR_APPLY(real, r_,
+      if (r__stride == 1) {
+        THVector_(fill)(r__data, value, r__size);
+        r__i = r__size;
+        r__data += r__stride * r__size;
+        break;
+      } else {
+        *r__data = value;
+      }
+      );
+  }
 }
 
 void THTensor_(zero)(THTensor *r_)
 {
-  TH_TENSOR_APPLY(real, r_,
-                  THVector_(fill)(r__data, 0, r__size); break;);
+  THTensor_(fill)(r_, 0);
 }
 
 void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value)
@@ -405,9 +508,17 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
 #undef th_isnan
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
 #define th_isnan(val) \
-if (isnan(value)) break;
+(isnan(val))
 #else
-#define th_isnan(val)
+#define th_isnan(val) (0)
+#endif
+
+#undef th_isnan_break
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+#define th_isnan_break(val) \
+if (isnan(val)) break;
+#else
+#define th_isnan_break(val)
 #endif
 
 real THTensor_(minall)(THTensor *tensor)
@@ -423,7 +534,7 @@ real THTensor_(minall)(THTensor *tensor)
                   if(!(value >= theMin))
                   {
                     theMin = value;
-                    th_isnan(value)
+                    th_isnan_break(value)
                   });
   return theMin;
 }
@@ -441,7 +552,7 @@ real THTensor_(maxall)(THTensor *tensor)
                   if(!(value <= theMax))
                   {
                     theMax = value;
-                    th_isnan(value)
+                    th_isnan_break(value)
                   });
   return theMax;
 }
@@ -464,15 +575,9 @@ void THTensor_(add)(THTensor *r_, THTensor *t, real value)
 {
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
-      real *tp = THTensor_(data)(t);
-      real *rp = THTensor_(data)(r_);
-      ptrdiff_t sz = THTensor_(nElement)(t);
-      ptrdiff_t i;
-      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-          rp[i] = tp[i] + value;
+    TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(adds)(r__data, t_data, value, r__len););
   } else {
-      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
   }
 }
 
@@ -485,15 +590,9 @@ void THTensor_(mul)(THTensor *r_, THTensor *t, real value)
 {
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
-      real *tp = THTensor_(data)(t);
-      real *rp = THTensor_(data)(r_);
-      ptrdiff_t sz = THTensor_(nElement)(t);
-      ptrdiff_t i;
-      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-          rp[i] = tp[i] * value;
+    TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(muls)(r__data, t_data, value, r__len););
   } else {
-      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
   }
 }
 
@@ -501,22 +600,87 @@ void THTensor_(div)(THTensor *r_, THTensor *t, real value)
 {
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(divs)(r__data, t_data, value, r__len););
+  } else {
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;);
+  }
+}
+
+void THTensor_(lshift)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT)
+  return THTensor_(mul)(r_, t, powf(2, value));
+#elif defined(TH_REAL_IS_DOUBLE)
+  return THTensor_(mul)(r_, t, pow(2, value));
+#elif defined(TH_REAL_IS_HALF)
+  return THError("lshift is not supported for torch.HalfTensor");
+#else
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
       real *tp = THTensor_(data)(t);
       real *rp = THTensor_(data)(r_);
-      ptrdiff_t sz = THTensor_(nElement)(t);
-      ptrdiff_t i;
-      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-          rp[i] = tp[i] / value;
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+      for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_BYTE)
+          rp[i] = ((real) tp[i]) << value;
+#else
+          rp[i] = ((unsigned real) tp[i]) << value;
+#endif
+      }
   } else {
-      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;);
+#if defined(TH_REAL_IS_BYTE)
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) << value););
+#else
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((unsigned real) *t_data) << value););
+#endif
+  }
+#endif
+}
+
+void THTensor_(rshift)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT)
+  return THTensor_(div)(r_, t, powf(2, value));
+#elif defined(TH_REAL_IS_DOUBLE)
+  return THTensor_(div)(r_, t, pow(2, value));
+#elif defined(TH_REAL_IS_HALF)
+  return THError("rshift is not supported for torch.HalfTensor");
+#else
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+      for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_BYTE)
+          rp[i] = ((real) tp[i]) >> value;
+#else
+          rp[i] = ((unsigned real) tp[i]) >> value;
+#endif
+      }
+  } else {
+#if defined(TH_REAL_IS_BYTE)
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) >> value););
+#else
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((unsigned real) *t_data) >> value););
+#endif
   }
+#endif
 }
 
 void THTensor_(fmod)(THTensor *r_, THTensor *t, real value)
 {
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+
       real *tp = THTensor_(data)(t);
       real *rp = THTensor_(data)(r_);
       ptrdiff_t sz = THTensor_(nElement)(t);
@@ -564,20 +728,89 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
   }
 }
 
-void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value)
+void THTensor_(bitand)(THTensor *r_, THTensor *t, real value)
 {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+  return THError("bitand is only supported for integer type tensors");
+#else
   THTensor_(resizeAs)(r_, t);
-  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
       real *tp = THTensor_(data)(t);
       real *rp = THTensor_(data)(r_);
-      /* real t_val; */
-      ptrdiff_t sz = THTensor_(nElement)(t);
-      ptrdiff_t i;
-      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-          rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+      for (i=0; i<sz; i++) {
+          rp[i] = tp[i] & value;
+      }
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data & value;);
+  }
+#endif
+}
+
+void THTensor_(bitor)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+  return THError("bitor is only supported for integer type tensors");
+#else
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+      for (i=0; i<sz; i++) {
+          rp[i] = tp[i] | value;
+      }
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data | value;);
+  }
+#endif
+}
+
+void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+  return THError("bitxor is only supported for integer type tensors");
+#else
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+      for (i=0; i<sz; i++) {
+          rp[i] = tp[i] ^ value;
+      }
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data ^ value;);
+  }
+#endif
+}
+
+void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *tp = THTensor_(data)(t);
+    real *rp = THTensor_(data)(r_);
+    /* real t_val; */
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i;
+    #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+    for (i=0; i<sz; i++)
+      rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
   } else {
-      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
   }
 }
 
@@ -588,17 +821,10 @@ void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src)
     if(r_ == t) {
       THBlas_(axpy)(THTensor_(nElement)(t), value, THTensor_(data)(src), 1, THTensor_(data)(r_), 1);
     } else {
-      real *tp = THTensor_(data)(t);
-      real *sp = THTensor_(data)(src);
-      real *rp = THTensor_(data)(r_);
-      ptrdiff_t sz = THTensor_(nElement)(t);
-      ptrdiff_t i;
-      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i< sz; i++)
-          rp[i] = tp[i] + value * sp[i];
+      TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cadd)(r__data, t_data, src_data, value, r__len););
     }
   } else {
-      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;);
+    TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;);
   }
 }
 
@@ -611,16 +837,9 @@ void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src)
 {
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
-      real *tp = THTensor_(data)(t);
-      real *sp = THTensor_(data)(src);
-      real *rp = THTensor_(data)(r_);
-      ptrdiff_t sz = THTensor_(nElement)(t);
-      ptrdiff_t i;
-      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-        rp[i] = tp[i] * sp[i];
+    TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cmul)(r__data, t_data, src_data, r__len););
   } else {
-      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;);
+    TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;);
   }
 }
 
@@ -628,33 +847,106 @@ void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src)
 {
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+    real *tp = THTensor_(data)(t);
+    real *sp = THTensor_(data)(src);
+    real *rp = THTensor_(data)(r_);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i;
+    #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+    for (i=0; i<sz; i++)
+      rp[i] = pow(tp[i], sp[i]);
+  } else {
+    TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = pow(*t_data, *src_data););
+  }
+}
+
+void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+    TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cdiv)(r__data, t_data, src_data, r__len););
+  } else {
+    TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / *src_data;);
+  }
+}
+
+void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src)
+{
+#if defined(TH_REAL_IS_HALF)
+  return THError("clshift is not supported for torch.HalfTensor");
+#endif
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(isContiguous)(src) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
       real *tp = THTensor_(data)(t);
       real *sp = THTensor_(data)(src);
       real *rp = THTensor_(data)(r_);
       ptrdiff_t sz = THTensor_(nElement)(t);
       ptrdiff_t i;
       #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-        rp[i] = pow(tp[i], sp[i]);
+    for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_FLOAT)
+      rp[i] = tp[i] * powf(2, sp[i]);
+#elif defined(TH_REAL_IS_DOUBLE)
+      rp[i] = tp[i] * pow(2, sp[i]);
+#elif defined(TH_REAL_IS_BYTE)
+      rp[i] = ((real) tp[i]) << sp[i];
+#else
+      rp[i] = ((unsigned real) tp[i]) << sp[i];
+#endif
+    }
   } else {
-      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = pow(*t_data, *src_data););
+#if defined(TH_REAL_IS_FLOAT)
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * powf(2, *src_data););
+#elif defined(TH_REAL_IS_DOUBLE)
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * pow(2, *src_data););
+#elif defined(TH_REAL_IS_BYTE)
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) << *src_data;);
+#else
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((unsigned real)*t_data) << *src_data;);
+#endif
   }
 }
 
-void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src)
+void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src)
 {
+#if defined(TH_REAL_IS_HALF)
+  return THError("crshift is not supported for torch.HalfTensor");
+#endif
   THTensor_(resizeAs)(r_, t);
-  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(isContiguous)(src) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
       real *tp = THTensor_(data)(t);
       real *sp = THTensor_(data)(src);
       real *rp = THTensor_(data)(r_);
       ptrdiff_t sz = THTensor_(nElement)(t);
       ptrdiff_t i;
       #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-        rp[i] = tp[i] / sp[i];
+    for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_FLOAT)
+      rp[i] = tp[i] / powf(2, sp[i]);
+#elif defined(TH_REAL_IS_DOUBLE)
+      rp[i] = tp[i] / pow(2, sp[i]);
+#elif defined(TH_REAL_IS_BYTE)
+      rp[i] = ((real) tp[i]) >> sp[i];
+#else
+      rp[i] = ((unsigned real) tp[i]) >> sp[i];
+#endif
+    }
   } else {
-      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / *src_data;);
+#if defined(TH_REAL_IS_FLOAT)
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / powf(2, *src_data););
+#elif defined(TH_REAL_IS_DOUBLE)
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / pow(2, *src_data););
+#elif defined(TH_REAL_IS_BYTE)
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;);
+#else
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((unsigned real)*t_data) >> *src_data;);
+#endif
   }
 }
 
@@ -713,19 +1005,94 @@ void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src)
   }
 }
 
-void THTensor_(tpow)(THTensor *r_, real value, THTensor *t)
+void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src)
 {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+  return THError("cbitand is only supported for integer type tensors");
+#else
   THTensor_(resizeAs)(r_, t);
-  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(isContiguous)(src) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      ptrdiff_t sz = THTensor_(nElement)(t);
+      ptrdiff_t i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+    for (i=0; i<sz; i++) {
+      rp[i] = tp[i] & sp[i];
+    }
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data & *src_data;);
+  }
+#endif
+}
+
+void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+  return THError("cbitor is only supported for integer type tensors");
+#else
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(isContiguous)(src) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      ptrdiff_t sz = THTensor_(nElement)(t);
+      ptrdiff_t i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+    for (i=0; i<sz; i++) {
+      rp[i] = tp[i] | sp[i];
+    }
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data | *src_data;);
+  }
+#endif
+}
+
+void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+  return THError("cbitxor is only supported for integer type tensors");
+#else
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) &&
+      THTensor_(isContiguous)(t) &&
+      THTensor_(isContiguous)(src) &&
+      THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
       real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
       real *rp = THTensor_(data)(r_);
       ptrdiff_t sz = THTensor_(nElement)(t);
       ptrdiff_t i;
       #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
-      for (i=0; i<sz; i++)
-        rp[i] = pow(value, tp[i]);
+    for (i=0; i<sz; i++) {
+      rp[i] = tp[i] ^ sp[i];
+    }
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data ^ *src_data;);
+  }
+#endif
+}
+
+void THTensor_(tpow)(THTensor *r_, real value, THTensor *t)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *tp = THTensor_(data)(t);
+    real *rp = THTensor_(data)(r_);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i;
+    #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+    for (i=0; i<sz; i++)
+      rp[i] = pow(value, tp[i]);
   } else {
-      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data););
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data););
   }
 }
 
@@ -1110,10 +1477,6 @@ ptrdiff_t THTensor_(numel)(THTensor *t)
 void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
 {
   THLongStorage *dim;
-  real theMax;
-  real value;
-  long theIndex;
-  long i;
 
   THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
       dimension + TH_INDEX_BASE);
@@ -1124,32 +1487,67 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
   THLongTensor_resize(indices_, dim, NULL);
   THLongStorage_free(dim);
 
-  TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
-                       theMax = t_data[0];
-                       theIndex = 0;
+  // two implementations optimized for data locality
+  if (t->stride[dimension] == 1) {
+    real theMax;
+    real value;
+    long theIndex;
+    long i;
+    TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+                         theMax = t_data[0];
+                         theIndex = 0;
 
-                       for(i = 0; i < t_size; i++)
-                       {
-                         value = t_data[i*t_stride];
-                         /* This is not the same as value>theMax in the case of NaNs */
-                         if(!(value <= theMax))
+                         for(i = 0; i < t_size; i++)
                          {
-                           theIndex = i;
-                           theMax = value;
-                           th_isnan(value)
+                           value = t_data[i*t_stride];
+                           /* This is not the same as value>theMax in the case of NaNs */
+                           if(!(value <= theMax))
+                           {
+                             theIndex = i;
+                             theMax = value;
+                             th_isnan_break(value)
+                           }
                          }
-                       }
-                       *indices__data = theIndex;
-                       *values__data = theMax;);
+                         *indices__data = theIndex;
+                         *values__data = theMax;);
+  } else {
+    if (THTensor_(nDimension)(t) > 1) {
+      THTensor *t0 = THTensor_(newSelect)(t, dimension, 0);
+      THTensor_(copy)(values_, t0);
+      THTensor_(free)(t0);
+    } else {
+      THTensor_(fill)(values_, THTensor_(get1d)(t, 0));
+    }
+    THLongTensor_zero(indices_);
+
+    if(t->size[dimension] == 1) {
+      return;
+    }
+
+    THTensor *tempValues_ = THTensor_(newWithTensor)(values_);
+    // tempValues_.expand_as(t)
+    tempValues_->size[dimension] = t->size[dimension];
+    tempValues_->stride[dimension] = 0;
+
+    THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_);
+    // tempIndices_.expand_as(t)
+    tempIndices_->size[dimension] = t->size[dimension];
+    tempIndices_->stride[dimension] = 0;
+
+    TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension,
+                          if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) {
+                            *tempValues__data = *t_data;
+                            *tempIndices__data = *tempIndices__dimOffset;
+                          });
+
+    THTensor_(free)(tempValues_);
+    THLongTensor_free(tempIndices_);
+  }
 }
 
 void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
 {
   THLongStorage *dim;
-  real theMin;
-  real value;
-  long theIndex;
-  long i;
 
   THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
       dimension + TH_INDEX_BASE);
@@ -1160,23 +1558,59 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
   THLongTensor_resize(indices_, dim, NULL);
   THLongStorage_free(dim);
 
-  TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
-                       theMin = t_data[0];
-                       theIndex = 0;
+  // two implementations optimized for data locality
+  if (t->stride[dimension] == 1) {
+    real theMax;
+    real value;
+    long theIndex;
+    long i;
+    TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+                         theMax = t_data[0];
+                         theIndex = 0;
 
-                       for(i = 0; i < t_size; i++)
-                       {
-                         value = t_data[i*t_stride];
-                         /* This is not the same as value<theMin in the case of NaNs */
-                         if(!(value >= theMin))
+                         for(i = 0; i < t_size; i++)
                          {
-                           theIndex = i;
-                           theMin = value;
-                           th_isnan(value)
+                           value = t_data[i*t_stride];
+                           /* This is not the same as value<theMax in the case of NaNs */
+                           if(!(value >= theMax))
+                           {
+                             theIndex = i;
+                             theMax = value;
+                             th_isnan_break(value)
+                           }
                          }
-                       }
-                       *indices__data = theIndex;
-                       *values__data = theMin;);
+                         *indices__data = theIndex;
+                         *values__data = theMax;);
+  } else {
+    if (THTensor_(nDimension)(t) > 1) {
+      THTensor *t0 = THTensor_(newSelect)(t, dimension, 0);
+      THTensor_(copy)(values_, t0);
+      THTensor_(free)(t0);
+    } else {
+      THTensor_(fill)(values_, THTensor_(get1d)(t, 0));
+    }
+    THLongTensor_zero(indices_);
+
+    if(t->size[dimension] == 1) {
+      return;
+    }
+
+    THTensor *tempValues_ = THTensor_(newWithTensor)(values_);
+    // tempValues_.expand_as(t)
+    tempValues_->size[dimension] = t->size[dimension];
+    tempValues_->stride[dimension] = 0;
+
+    THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_);
+    // tempIndices_.expand_as(t)
+    tempIndices_->size[dimension] = t->size[dimension];
+    tempIndices_->stride[dimension] = 0;
+
+    TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension,
+                          if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) {
+                            *tempValues__data = *t_data;
+                            *tempIndices__data = *tempIndices__dimOffset;
+                          });
+  }
 }
 
 
@@ -1192,12 +1626,24 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension)
   THTensor_(resize)(r_, dim, NULL);
   THLongStorage_free(dim);
 
-  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
-                       accreal sum = 0;
-                       long i;
-                       for(i = 0; i < t_size; i++)
-                         sum += t_data[i*t_stride];
-                       *r__data = (real)sum;);
+  // two implementations optimized for data locality
+  if (t->stride[dimension] == 1) {
+    TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                         accreal sum = 0;
+                         long i;
+                         for(i = 0; i < t_size; i++)
+                           sum += t_data[i*t_stride];
+                         *r__data = (real)sum;);
+  } else {
+    THTensor_(zero)(r_);
+    THTensor *temp_ = THTensor_(newWithTensor)(r_);
+    // r_.expand_as(t)
+    temp_->size[dimension] = t->size[dimension];
+    temp_->stride[dimension] = 0;
+
+    TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;);
+    THTensor_(free)(temp_);
+  }
 }
 
 void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
@@ -1212,13 +1658,24 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
   THTensor_(resize)(r_, dim, NULL);
   THLongStorage_free(dim);
 
-  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
-                       accreal prod = 1;
-                       long i;
-                       for(i = 0; i < t_size; i++)
-                         prod *= t_data[i*t_stride];
-                       *r__data = (real)prod;);
+  // two implementations optimized for data locality
+  if (t->stride[dimension] == 1) {
+    TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                         accreal prod = 1;
+                         long i;
+                         for(i = 0; i < t_size; i++)
+                           prod *= t_data[i*t_stride];
+                         *r__data = (real)prod;);
+  } else {
+    THTensor_(fill)(r_, 1);
+    THTensor *temp_ = THTensor_(newWithTensor)(r_);
+    // r_.expand_as(t)
+    temp_->size[dimension] = t->size[dimension];
+    temp_->stride[dimension] = 0;
 
+    TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;);
+    THTensor_(free)(temp_);
+  }
 }
 
 void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension)
@@ -1262,13 +1719,13 @@ void THTensor_(sign)(THTensor *r_, THTensor *t)
 
 #if defined (TH_REAL_IS_BYTE)
   TH_TENSOR_APPLY2(real, r_, real, t,
-		   if (*t_data > 0) *r__data = 1;
-		   else *r__data = 0;);
+    if (*t_data > 0) *r__data = 1;
+    else *r__data = 0;);
 #else
   TH_TENSOR_APPLY2(real, r_, real, t,
-		   if (*t_data > 0) *r__data = 1;
-		   else if (*t_data < 0) *r__data = -1;
-		   else *r__data = 0;);
+    if (*t_data > 0) *r__data = 1;
+    else if (*t_data < 0) *r__data = -1;
+    else *r__data = 0;);
 #endif
 }
 
@@ -1534,75 +1991,75 @@ static void THTensor_(quicksortascend)(real *arr, long *idx, long elements, long
 
   while(!done) {
       /* Use median of three for pivot choice */
-      P=(L+R)>>1;
-      BOTH_SWAP(P, L+1);
-      if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
-      if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
-      if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
+    P=(L+R)>>1;
+    BOTH_SWAP(P, L+1);
+    if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
+    if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
+    if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
 
-      i = L+1; j = R; piv = ARR(L); pid = IDX(L);
+    i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 
-      do {
-          do { i = i+1; } while(ARR(i) < piv);
-          do { j = j-1; } while(ARR(j) > piv);
-          if (j < i)
-              break;
-          BOTH_SWAP(i, j);
-      } while(1);
-      BOTH_SWAP(L, j);
-      /* Left subfile is (L, j-1) */
-      /* Right subfile is (i, R) */
-      sz_left = j-L;
-      sz_right = R-i+1;
-      if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
-          /* both subfiles are small */
-          /* if stack empty */
-          if (stack == 0) {
-              done = 1;
-          } else {
-              stack--;
-              L = beg[stack];
-              R = end[stack];
-          }
-      } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
-              /* exactly one of the subfiles is small */
-              /* (L,R) = large subfile */
-              if (sz_left > sz_right) {
-                  /* Implicit: L = L; */
-                  R = j-1;
-              } else {
-                  L = i;
-                  /* Implicit: R = R; */
-              }
+    do {
+      do { i = i+1; } while(ARR(i) < piv);
+      do { j = j-1; } while(ARR(j) > piv);
+      if (j < i)
+          break;
+      BOTH_SWAP(i, j);
+    } while(1);
+    BOTH_SWAP(L, j);
+    /* Left subfile is (L, j-1) */
+    /* Right subfile is (i, R) */
+    sz_left = j-L;
+    sz_right = R-i+1;
+    if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
+      /* both subfiles are small */
+      /* if stack empty */
+      if (stack == 0) {
+        done = 1;
+      } else {
+        stack--;
+        L = beg[stack];
+        R = end[stack];
+      }
+    } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
+      /* exactly one of the subfiles is small */
+      /* (L,R) = large subfile */
+      if (sz_left > sz_right) {
+        /* Implicit: L = L; */
+        R = j-1;
       } else {
-          /* none of the subfiles is small */
-          /* push large subfile */
-          /* (L,R) = small subfile */
-          if (sz_left > sz_right) {
-              beg[stack] = L;
-              end[stack] = j-1;
-              stack++;
-              L = i;
-              /* Implicit: R = R */
-          } else {
-              beg[stack] = i;
-              end[stack] = R;
-              stack++;
-              /* Implicit: L = L; */
-              R = j-1;
-          }
+        L = i;
+        /* Implicit: R = R; */
       }
+    } else {
+      /* none of the subfiles is small */
+      /* push large subfile */
+      /* (L,R) = small subfile */
+      if (sz_left > sz_right) {
+        beg[stack] = L;
+        end[stack] = j-1;
+        stack++;
+        L = i;
+        /* Implicit: R = R */
+      } else {
+        beg[stack] = i;
+        end[stack] = R;
+        stack++;
+        /* Implicit: L = L; */
+        R = j-1;
+      }
+    }
   } /* while not done */
   /* Now insertion sort on the concatenation of subfiles */
   for(i=elements-2; i>=0; i--) {
     if (ARR(i) > ARR(i+1)) {
-          piv = ARR(i);
+      piv = ARR(i);
       pid = IDX(i);
       j = i+1;
       do {
-          ARR(j-1) = ARR(j);
-          IDX(j-1) = IDX(j);
-          j = j+1;
+        ARR(j-1) = ARR(j);
+        IDX(j-1) = IDX(j);
+        j = j+1;
       } while(j < elements && ARR(j) < piv);
       ARR(j-1) = piv;
       IDX(j-1) = pid;
@@ -1623,75 +2080,75 @@ static void THTensor_(quicksortdescend)(real *arr, long *idx, long elements, lon
 
   while(!done) {
       /* Use median of three for pivot choice */
-      P=(L+R)>>1;
-      BOTH_SWAP(P, L+1);
-      if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
-      if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
-      if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
+    P=(L+R)>>1;
+    BOTH_SWAP(P, L+1);
+    if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
+    if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
+    if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
 
-      i = L+1; j = R; piv = ARR(L); pid = IDX(L);
+    i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 
-      do {
-          do { i = i+1; } while(ARR(i) > piv);
-          do { j = j-1; } while(ARR(j) < piv);
-          if (j < i)
-              break;
-          BOTH_SWAP(i, j);
-      } while(1);
-      BOTH_SWAP(L, j);
-      /* Left subfile is (L, j-1) */
-      /* Right subfile is (i, R) */
-      sz_left = j-L;
-      sz_right = R-i+1;
-      if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
-          /* both subfiles are small */
-          /* if stack empty */
-          if (stack == 0) {
-              done = 1;
-          } else {
-              stack--;
-              L = beg[stack];
-              R = end[stack];
-          }
-      } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
-              /* exactly one of the subfiles is small */
-              /* (L,R) = large subfile */
-              if (sz_left > sz_right) {
-                  /* Implicit: L = L; */
-                  R = j-1;
-              } else {
-                  L = i;
-                  /* Implicit: R = R; */
-              }
+    do {
+      do { i = i+1; } while(ARR(i) > piv);
+      do { j = j-1; } while(ARR(j) < piv);
+      if (j < i)
+          break;
+      BOTH_SWAP(i, j);
+    } while(1);
+    BOTH_SWAP(L, j);
+    /* Left subfile is (L, j-1) */
+    /* Right subfile is (i, R) */
+    sz_left = j-L;
+    sz_right = R-i+1;
+    if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
+      /* both subfiles are small */
+      /* if stack empty */
+      if (stack == 0) {
+        done = 1;
+      } else {
+        stack--;
+        L = beg[stack];
+        R = end[stack];
+      }
+    } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
+      /* exactly one of the subfiles is small */
+      /* (L,R) = large subfile */
+      if (sz_left > sz_right) {
+        /* Implicit: L = L; */
+        R = j-1;
+      } else {
+        L = i;
+        /* Implicit: R = R; */
+      }
+    } else {
+      /* none of the subfiles is small */
+      /* push large subfile */
+      /* (L,R) = small subfile */
+      if (sz_left > sz_right) {
+        beg[stack] = L;
+        end[stack] = j-1;
+        stack++;
+        L = i;
+        /* Implicit: R = R */
       } else {
-          /* none of the subfiles is small */
-          /* push large subfile */
-          /* (L,R) = small subfile */
-          if (sz_left > sz_right) {
-              beg[stack] = L;
-              end[stack] = j-1;
-              stack++;
-              L = i;
-              /* Implicit: R = R */
-          } else {
-              beg[stack] = i;
-              end[stack] = R;
-              stack++;
-              /* Implicit: L = L; */
-              R = j-1;
-          }
+        beg[stack] = i;
+        end[stack] = R;
+        stack++;
+        /* Implicit: L = L; */
+        R = j-1;
       }
+    }
   } /* while not done */
   /* Now insertion sort on the concatenation of subfiles */
   for(i=elements-2; i>=0; i--) {
     if (ARR(i) < ARR(i+1)) {
-          piv = ARR(i);
+      piv = ARR(i);
       pid = IDX(i);
       j = i+1;
       do {
-          ARR(j-1) = ARR(j);
-          IDX(j-1) = IDX(j);
-          j = j+1;
+        ARR(j-1) = ARR(j);
+        IDX(j-1) = IDX(j);
+        j = j+1;
       } while(j < elements && ARR(j) > piv);
       ARR(j-1) = piv;
       IDX(j-1) = pid;
@@ -2044,7 +2501,9 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
   int maxDim = dimension + 1;
   int allEmpty = 1;
   int allContiguous = 1;
-  int ldimension = dimension;
+
+  // cat_dimension is the actual dimension we cat along
+  int cat_dimension = dimension;
 
   for (i = 0; i < numInputs; i++)
   {
@@ -2053,13 +2512,13 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
 
   // When the user input dimension is -1 (i.e. -2 in C)
   // Then we pick the maximum last dimension across all tensors.
-  if ( dimension == -2 )
+  if ( dimension + TH_INDEX_BASE == -1 )
   {
-    ldimension = maxDim?(maxDim-1):0;
+    cat_dimension = maxDim?(maxDim-1):0;
   }
 
   THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
-  THArgCheck(ldimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
+  THArgCheck(cat_dimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
 
   size = THLongStorage_newWithSize(maxDim);
 
@@ -2067,7 +2526,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
   {
     // dimSize is either the size of the dim if it exists, either 1 if #dim > 0, otherwise 0
     long dimSize = i < inputs[0]->nDimension ? inputs[0]->size[i] : THMin(inputs[0]->nDimension, 1);
-    if (i == ldimension)
+    if (i == cat_dimension)
     {
       for (j = 1; j < numInputs; j++)
       {
@@ -2114,7 +2573,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
 
     // First path is for contiguous inputs along dim 1
     // Second path for non-contiguous
-    if (ldimension == 0 && allContiguous)
+    if (cat_dimension == 0 && allContiguous)
     {
       real* result_data = result->storage->data + result->storageOffset;
       offset = 0;
@@ -2137,9 +2596,9 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
       {
         if (inputs[j]->nDimension)
         {
-          long dimSize = ldimension < inputs[j]->nDimension ? inputs[j]->size[ldimension] : 1;
+          long dimSize = cat_dimension < inputs[j]->nDimension ? inputs[j]->size[cat_dimension] : 1;
           THTensor *nt = THTensor_(newWithTensor)(result);
-          THTensor_(narrow)(nt, NULL, ldimension, offset, dimSize);
+          THTensor_(narrow)(nt, NULL, cat_dimension, offset, dimSize);
           THTensor_(copy)(nt, inputs[j]);
           THTensor_(free)(nt);
           offset += dimSize;
@@ -2178,25 +2637,25 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
 #define TENSOR_IMPLEMENT_LOGICAL(NAME,OP)				\
   void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value)	\
   {									\
-    THByteTensor_rawResize(r_, t->nDimension, t->size, NULL);		\
+    THByteTensor_resizeNd(r_, t->nDimension, t->size, NULL);		\
     TH_TENSOR_APPLY2(unsigned char, r_, real, t,			\
 		     *r__data = (*t_data OP value) ? 1 : 0;); \
   }									\
   void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value)	\
   {									\
-    THTensor_(rawResize)(r_, t->nDimension, t->size, NULL);		\
+    THTensor_(resizeNd)(r_, t->nDimension, t->size, NULL);		\
     TH_TENSOR_APPLY2(real, r_, real, t,					\
 		     *r__data = (*t_data OP value) ? 1 : 0;); \
   }									\
   void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
   {									\
-    THByteTensor_rawResize(r_, ta->nDimension, ta->size, NULL);		\
+    THByteTensor_resizeNd(r_, ta->nDimension, ta->size, NULL);		\
     TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb,		\
 		     *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
   }									\
   void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
   {									\
-    THTensor_(rawResize)(r_, ta->nDimension, ta->size, NULL);		\
+    THTensor_(resizeNd)(r_, ta->nDimension, ta->size, NULL);		\
     TH_TENSOR_APPLY3(real, r_, real, ta, real, tb,			\
 		     *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
   }									\
@@ -2290,22 +2749,11 @@ void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight)
 
 void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension)
 {
-  THLongStorage *dim;
-
   THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d",
       dimension + TH_INDEX_BASE);
 
-  dim = THTensor_(newSizeOf)(t);
-  THLongStorage_set(dim, dimension, 1);
-  THTensor_(resize)(r_, dim, NULL);
-  THLongStorage_free(dim);
-
-  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
-                       accreal sum = 0;
-                       long i;
-                       for(i = 0; i < t_size; i++)
-                         sum += t_data[i*t_stride];
-                       *r__data = (real)sum/t_size;);
+  THTensor_(sum)(r_, t, dimension);
+  THTensor_(div)(r_, r_, t->size[dimension]);
 }
 
 void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
@@ -2491,7 +2939,7 @@ accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value)
 {
   real sum = 0;
   TH_TENSOR_APPLY2(real, tensor, real, src,
-	sum += pow(fabs(*tensor_data - *src_data), value);)
+  sum += pow(fabs(*tensor_data - *src_data), value);)
   return pow(sum, 1.0/value);
 }
 
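The rewritten THTensor_(mean) above now reduces with THTensor_(sum) and then divides by the size of the reduced dimension instead of carrying its own accumulation loop. A minimal standalone sketch of that sum-then-divide reduction on a plain row-major matrix (hypothetical names, not TH code):

    /* Sketch: mean over the row dimension of a row-major matrix, computed as
     * a sum pass followed by a divide, mirroring the new THTensor_(mean).
     * Standalone illustration only. */
    #include <stdio.h>

    static void mean_rows(const double *m, long rows, long cols, double *out)
    {
      for (long j = 0; j < cols; j++)
        out[j] = 0.0;
      for (long i = 0; i < rows; i++)        /* sum over the reduced dim */
        for (long j = 0; j < cols; j++)
          out[j] += m[i*cols + j];
      for (long j = 0; j < cols; j++)        /* then divide by its size */
        out[j] /= (double)rows;
    }

    int main(void)
    {
      const double m[2*3] = {1, 2, 3, 4, 5, 6};
      double out[3];
      mean_rows(m, 2, 3, out);
      printf("%g %g %g\n", out[0], out[1], out[2]);   /* 2.5 3.5 4.5 */
      return 0;
    }
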
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index c656dfd..ff994ed 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -34,17 +34,27 @@ TH_API void THTensor_(add)(THTensor *r_, THTensor *t, real value);
 TH_API void THTensor_(sub)(THTensor *self, THTensor *src, real value);
 TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, real value);
 TH_API void THTensor_(div)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(lshift)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(rshift)(THTensor *r_, THTensor *t, real value);
 TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, real value);
 TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, real value);
 TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value);
+TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value);
 
 TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src);
 TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, real value, THTensor *src2);
 TH_API void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src);
 TH_API void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src);
 TH_API void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src);
 TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src);
 TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src);
 
 TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2);
 TH_API void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2);
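
These declarations add scalar and tensor-tensor variants of shift and bitwise operations next to the existing fmod/remainder family. As a hedged scalar reference for the expected element-wise behaviour (plain arrays, not TH code; the multiply-by-2^q and floor-divide-by-2^q behaviour of lshift/rshift matches the checks added to test.lua further below, while the bitand example simply assumes the obvious & semantics):

    /* Scalar reference for the new element-wise ops on a plain long array.
     * Assumed semantics: lshift(q) multiplies by 2^q, rshift(q) divides by 2^q
     * (flooring for integers), bitand applies & against a scalar value. */
    #include <stdio.h>

    static void lshift_s(long *r, const long *t, unsigned q, long n)
    { for (long i = 0; i < n; i++) r[i] = t[i] << q; }

    static void rshift_s(long *r, const long *t, unsigned q, long n)
    { for (long i = 0; i < n; i++) r[i] = t[i] >> q; }

    static void bitand_s(long *r, const long *t, long v, long n)
    { for (long i = 0; i < n; i++) r[i] = t[i] & v; }

    int main(void)
    {
      long t[4] = {1, 2, 3, 4}, r[4];
      lshift_s(r, t, 2, 4);   /* 4 8 12 16 */
      rshift_s(r, t, 1, 4);   /* 0 1 1 2   */
      bitand_s(r, t, 1, 4);   /* 1 0 1 0   */
      printf("%ld %ld %ld %ld\n", r[0], r[1], r[2], r[3]);
      return 0;
    }
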
diff --git a/lib/TH/generic/THVector.h b/lib/TH/generic/THVector.h
index 67fdcfa..7d36854 100644
--- a/lib/TH/generic/THVector.h
+++ b/lib/TH/generic/THVector.h
@@ -3,10 +3,13 @@
 #else
 
 TH_API void THVector_(fill)(real *x, const real c, const ptrdiff_t n);
-TH_API void THVector_(add)(real *y, const real *x, const real c, const ptrdiff_t n);
-TH_API void THVector_(diff)(real *z, const real *x, const real *y, const ptrdiff_t n);
-TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n);
-TH_API void THVector_(mul)(real *y, const real *x, const ptrdiff_t n);
+TH_API void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n);
+TH_API void THVector_(adds)(real *y, const real *x, const real c, const ptrdiff_t n);
+TH_API void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n);
+TH_API void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n);
+TH_API void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n);
+TH_API void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n);
+TH_API void THVector_(copy)(real *y, const real *x, const ptrdiff_t n);
 
 /* Initialize the dispatch pointers */
 TH_API void THVector_(vectorDispatchInit)(void);
diff --git a/lib/TH/generic/THVectorDefault.c b/lib/TH/generic/THVectorDefault.c
index aabc16c..3388e0d 100644
--- a/lib/TH/generic/THVectorDefault.c
+++ b/lib/TH/generic/THVectorDefault.c
@@ -2,10 +2,25 @@
 #define TH_GENERIC_FILE "generic/THVectorDefault.c"
 #else
 
+void THVector_(copy_DEFAULT)(real *x, const real *y, const ptrdiff_t n) {
+  ptrdiff_t i = 0;
+
+  for(; i <n-4; i+=4)
+  {
+    x[i] = y[i];
+    x[i+1] = y[i+1];
+    x[i+2] = y[i+2];
+    x[i+3] = y[i+3];
+  }
+
+  for(; i < n; i++)
+    x[i] = y[i];
+}
+
 void THVector_(fill_DEFAULT)(real *x, const real c, const ptrdiff_t n) {
   ptrdiff_t i = 0;
 
-  for(; i < n-4; i += 4)
+  for(; i <n-4; i+=4)
   {
     x[i] = c;
     x[i+1] = c;
@@ -17,68 +32,100 @@ void THVector_(fill_DEFAULT)(real *x, const real c, const ptrdiff_t n) {
     x[i] = c;
 }
 
-void THVector_(add_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
+void THVector_(cadd_DEFAULT)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n)
+{
+  ptrdiff_t i = 0;
+
+  for(; i<n-4; i+=4)
+  {
+    z[i] = x[i] + c * y[i];
+    z[i+1] = x[i+1] + c * y[i+1];
+    z[i+2] = x[i+2] + c * y[i+2];
+    z[i+3] = x[i+3] + c * y[i+3];
+  }
+
+  for(; i<n; i++)
+    z[i] = x[i] + c * y[i];
+}
+
+void THVector_(adds_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
+{
+  ptrdiff_t i = 0;
+
+  for(; i<n-4; i+=4)
+  {
+    y[i] = x[i] + c;
+    y[i+1] = x[i+1] + c;
+    y[i+2] = x[i+2] + c;
+    y[i+3] = x[i+3] + c;
+  }
+
+  for(; i<n; i++)
+    y[i] = x[i] + c;
+}
+
+void THVector_(cmul_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n)
 {
   ptrdiff_t i = 0;
 
-  for(;i < n-4; i += 4)
+  for(; i <n-4; i+=4)
   {
-    y[i] += c * x[i];
-    y[i+1] += c * x[i+1];
-    y[i+2] += c * x[i+2];
-    y[i+3] += c * x[i+3];
+    z[i] = x[i] * y[i];
+    z[i+1] = x[i+1] * y[i+1];
+    z[i+2] = x[i+2] * y[i+2];
+    z[i+3] = x[i+3] * y[i+3];
   }
 
   for(; i < n; i++)
-    y[i] += c * x[i];
+    z[i] = x[i] * y[i];
 }
 
-void THVector_(diff_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n)
+void THVector_(muls_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
 {
   ptrdiff_t i = 0;
 
-  for(; i < n-4; i += 4)
+  for(; i <n-4; i+=4)
   {
-    z[i] = x[i] - y[i];
-    z[i+1] = x[i+1] - y[i+1];
-    z[i+2] = x[i+2] - y[i+2];
-    z[i+3] = x[i+3] - y[i+3];
+    y[i] = x[i] * c;
+    y[i+1] = x[i+1] * c;
+    y[i+2] = x[i+2] * c;
+    y[i+3] = x[i+3] * c;
   }
 
   for(; i < n; i++)
-    z[i] = x[i] - y[i];
+    y[i] = x[i] * c;
 }
 
-void THVector_(scale_DEFAULT)(real *y, const real c, const ptrdiff_t n)
+void THVector_(cdiv_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n)
 {
   ptrdiff_t i = 0;
 
-  for(; i < n-4; i +=4)
+  for(; i<n-4; i+=4)
   {
-    y[i] *= c;
-    y[i+1] *= c;
-    y[i+2] *= c;
-    y[i+3] *= c;
+    z[i] = x[i] / y[i];
+    z[i+1] = x[i+1] / y[i+1];
+    z[i+2] = x[i+2] / y[i+2];
+    z[i+3] = x[i+3] / y[i+3];
   }
 
   for(; i < n; i++)
-    y[i] *= c;
+    z[i] = x[i] / y[i];
 }
 
-void THVector_(mul_DEFAULT)(real *y, const real *x, const ptrdiff_t n)
+void THVector_(divs_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
 {
   ptrdiff_t i = 0;
 
-  for(; i < n-4; i += 4)
+  for(; i<n-4; i+=4)
   {
-    y[i] *= x[i];
-    y[i+1] *= x[i+1];
-    y[i+2] *= x[i+2];
-    y[i+3] *= x[i+3];
+    y[i] = x[i] / c;
+    y[i+1] = x[i+1] / c;
+    y[i+2] = x[i+2] / c;
+    y[i+3] = x[i+3] / c;
   }
 
   for(; i < n; i++)
-    y[i] *= x[i];
+    y[i] = x[i] / c;
 }
 
 #endif
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
index 2436a12..5b88852 100644
--- a/lib/TH/generic/THVectorDispatch.c
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -26,6 +26,12 @@ static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
     #endif
   #endif
 
+  #if defined(USE_AVX)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(fill_AVX), SIMDExtension_AVX),
+    #endif
+  #endif
+
   #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
           || defined(USE_SSE4_1) || defined(USE_SSE4_2)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -38,115 +44,199 @@ void THVector_(fill)(real *x, const real c, const ptrdiff_t n) {
   THVector_(fill_DISPATCHPTR)(x, c, n);
 }
 
-static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(add_DEFAULT);
-static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
+static void (*THVector_(cadd_DISPATCHPTR))(real *, const real *, const real *, const real, const ptrdiff_t) = &THVector_(cadd_DEFAULT);
+static FunctionDescription THVector_(cadd_DISPATCHTABLE)[] = {
   #if defined(__NEON__)
     #if defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(add_NEON), SIMDExtension_NEON),
+      FUNCTION_IMPL(THVector_(cadd_NEON), SIMDExtension_NEON),
     #endif
   #endif
 
-  #if defined(__PPC64__)
+  #if defined(USE_AVX2)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(add_VSX), SIMDExtension_VSX),
+      FUNCTION_IMPL(THVector_(cadd_AVX2), SIMDExtension_AVX2),
+    #endif
+  #endif
+
+  #if defined(USE_AVX)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(cadd_AVX), SIMDExtension_AVX),
     #endif
   #endif
 
   #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
           || defined(USE_SSE4_1) || defined(USE_SSE4_2)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(add_SSE), SIMDExtension_SSE),
+      FUNCTION_IMPL(THVector_(cadd_SSE), SIMDExtension_SSE),
     #endif
   #endif
 
-  FUNCTION_IMPL(THVector_(add_DEFAULT), SIMDExtension_DEFAULT)
+  FUNCTION_IMPL(THVector_(cadd_DEFAULT), SIMDExtension_DEFAULT)
 };
-void THVector_(add)(real *y, const real *x, const real c, const ptrdiff_t n) {
-  THVector_(add_DISPATCHPTR)(y, x, c, n);
+void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n) {
+  THVector_(cadd_DISPATCHPTR)(z, x, y, c, n);
 }
 
-
-static void (*THVector_(diff_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(diff_DEFAULT);
-static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = {
+static void (*THVector_(adds_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(adds_DEFAULT);
+static FunctionDescription THVector_(adds_DISPATCHTABLE)[] = {
   #if defined(__NEON__)
     #if defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(diff_NEON), SIMDExtension_NEON),
+      FUNCTION_IMPL(THVector_(adds_NEON), SIMDExtension_NEON),
     #endif
   #endif
 
   #if defined(__PPC64__)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(diff_VSX), SIMDExtension_VSX),
+      FUNCTION_IMPL(THVector_(adds_VSX), SIMDExtension_VSX),
+    #endif
+  #endif
+
+  #if defined(USE_AVX)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(adds_AVX), SIMDExtension_AVX),
     #endif
   #endif
 
   #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
           || defined(USE_SSE4_1) || defined(USE_SSE4_2)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(diff_SSE), SIMDExtension_SSE),
+      FUNCTION_IMPL(THVector_(adds_SSE), SIMDExtension_SSE),
     #endif
   #endif
 
-  FUNCTION_IMPL(THVector_(diff_DEFAULT), SIMDExtension_DEFAULT)
+  FUNCTION_IMPL(THVector_(adds_DEFAULT), SIMDExtension_DEFAULT)
 };
-void THVector_(diff)(real *z, const real *x, const real *y, const ptrdiff_t n) {
-  THVector_(diff_DISPATCHPTR)(z, x, y, n);
+// Dispatch stubs that just call the pointers
+TH_API void THVector_(adds)(real *r_, const real *t, const real value, const ptrdiff_t n) {
+  THVector_(adds_DISPATCHPTR)(r_, t, value, n);
 }
 
+static void (*THVector_(cmul_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(cmul_DEFAULT);
+static FunctionDescription THVector_(cmul_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(cmul_NEON), SIMDExtension_NEON),
+    #endif
+  #endif
+
+  #if defined(USE_AVX)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(cmul_AVX), SIMDExtension_AVX),
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(cmul_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(cmul_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n) {
+  THVector_(cmul_DISPATCHPTR)(z, x, y, n);
+}
 
-static void (*THVector_(scale_DISPATCHPTR))(real *, const real, const ptrdiff_t) = &THVector_(scale_DEFAULT);
-static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = {
+static void (*THVector_(muls_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(muls_DEFAULT);
+static FunctionDescription THVector_(muls_DISPATCHTABLE)[] = {
   #if defined(__NEON__)
     #if defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(scale_NEON), SIMDExtension_NEON),
+      FUNCTION_IMPL(THVector_(muls_NEON), SIMDExtension_NEON),
     #endif
   #endif
 
   #if defined(__PPC64__)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(scale_VSX), SIMDExtension_VSX),
+      FUNCTION_IMPL(THVector_(muls_VSX), SIMDExtension_VSX),
+    #endif
+  #endif
+
+  #if defined(USE_AVX)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(muls_AVX), SIMDExtension_AVX),
     #endif
   #endif
 
   #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
           || defined(USE_SSE4_1) || defined(USE_SSE4_2)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(scale_SSE), SIMDExtension_SSE),
+      FUNCTION_IMPL(THVector_(muls_SSE), SIMDExtension_SSE),
     #endif
   #endif
 
-  FUNCTION_IMPL(THVector_(scale_DEFAULT), SIMDExtension_DEFAULT)
+  FUNCTION_IMPL(THVector_(muls_DEFAULT), SIMDExtension_DEFAULT)
 };
-TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n) {
-  THVector_(scale_DISPATCHPTR)(y, c, n);
+void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n) {
+  THVector_(muls_DISPATCHPTR)(y, x, c, n);
 }
 
+static void (*THVector_(cdiv_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(cdiv_DEFAULT);
+static FunctionDescription THVector_(cdiv_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(cdiv_NEON), SIMDExtension_NEON),
+    #endif
+  #endif
+
+  #if defined(USE_AVX)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(cdiv_AVX), SIMDExtension_AVX),
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(cdiv_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(cdiv_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n) {
+  THVector_(cdiv_DISPATCHPTR)(z, x, y, n);
+}
 
-static void (*THVector_(mul_DISPATCHPTR))(real *, const real *, const ptrdiff_t) = &THVector_(mul_DEFAULT);
-static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = {
+static void (*THVector_(divs_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(divs_DEFAULT);
+static FunctionDescription THVector_(divs_DISPATCHTABLE)[] = {
   #if defined(__NEON__)
     #if defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(mul_NEON), SIMDExtension_NEON),
+      FUNCTION_IMPL(THVector_(divs_NEON), SIMDExtension_NEON),
     #endif
   #endif
 
-  #if defined(__PPC64__)
+  #if defined(USE_AVX)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(mul_VSX), SIMDExtension_VSX),
+      FUNCTION_IMPL(THVector_(divs_AVX), SIMDExtension_AVX),
     #endif
   #endif
 
   #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
           || defined(USE_SSE4_1) || defined(USE_SSE4_2)
     #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
-      FUNCTION_IMPL(THVector_(mul_SSE), SIMDExtension_SSE),
+      FUNCTION_IMPL(THVector_(divs_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(divs_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n) {
+  THVector_(divs_DISPATCHPTR)(y, x, c, n);
+}
+
+static void (*THVector_(copy_DISPATCHPTR))(real *, const real *, const ptrdiff_t) = &THVector_(copy_DEFAULT);
+static FunctionDescription THVector_(copy_DISPATCHTABLE)[] = {
+  #if defined(USE_AVX)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(copy_AVX), SIMDExtension_AVX),
     #endif
   #endif
 
-  FUNCTION_IMPL(THVector_(mul_DEFAULT), SIMDExtension_DEFAULT)
+  FUNCTION_IMPL(THVector_(copy_DEFAULT), SIMDExtension_DEFAULT)
 };
-void THVector_(mul)(real *y, const real *x, const ptrdiff_t n) {
-  THVector_(mul_DISPATCHPTR);
+void THVector_(copy)(real *y, const real *x, const ptrdiff_t n) {
+  THVector_(copy_DISPATCHPTR)(y, x, n);
 }
 
 /* This needs to be called in order to initialize the dispatch pointers at runtime.
@@ -160,10 +250,13 @@ void THVector_(vectorDispatchInit)(void)
 {
   uint32_t hostSimdExts = detectHostSIMDExtensions();
   INIT_DISPATCH_PTR(fill);
-  INIT_DISPATCH_PTR(add);
-  INIT_DISPATCH_PTR(diff);
-  INIT_DISPATCH_PTR(scale);
-  INIT_DISPATCH_PTR(mul);
+  INIT_DISPATCH_PTR(cadd);
+  INIT_DISPATCH_PTR(adds);
+  INIT_DISPATCH_PTR(cmul);
+  INIT_DISPATCH_PTR(muls);
+  INIT_DISPATCH_PTR(cdiv);
+  INIT_DISPATCH_PTR(divs);
+  INIT_DISPATCH_PTR(copy);
 }
 
 #endif
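
Each dispatch table above lists candidate kernels from most to least specialised, and vectorDispatchInit() installs, for every operation, the first entry whose SIMD requirement is satisfied by detectHostSIMDExtensions(). A self-contained sketch of that table-plus-function-pointer pattern (all names here are invented for illustration; no TH macros):

    /* Sketch of the table-driven dispatch pattern used above: pick the first
     * implementation whose required CPU feature is present, fall back to an
     * unconditional default at the end of the table. */
    #include <stdio.h>
    #include <stdint.h>

    typedef void (*adds_fn)(float *y, const float *x, float c, long n);

    static void adds_scalar(float *y, const float *x, float c, long n)
    { for (long i = 0; i < n; i++) y[i] = x[i] + c; }

    /* stand-in for a SIMD kernel; a real build would guard this with __AVX__ */
    static void adds_fast(float *y, const float *x, float c, long n)
    { adds_scalar(y, x, c, n); }

    enum { EXT_DEFAULT = 0x0, EXT_AVX = 0x1 };

    struct entry { adds_fn fn; uint32_t required; };

    static const struct entry adds_table[] = {
      { adds_fast,   EXT_AVX     },
      { adds_scalar, EXT_DEFAULT },   /* always last: matches any host */
    };

    static adds_fn adds_ptr = adds_scalar;

    static void dispatch_init(uint32_t host_exts)
    {
      for (size_t i = 0; i < sizeof(adds_table)/sizeof(adds_table[0]); i++)
        if ((adds_table[i].required & host_exts) == adds_table[i].required) {
          adds_ptr = adds_table[i].fn;
          break;
        }
    }

    int main(void)
    {
      float x[3] = {1, 2, 3}, y[3];
      dispatch_init(EXT_AVX);           /* pretend the host reports AVX */
      adds_ptr(y, x, 0.5f, 3);
      printf("%g %g %g\n", y[0], y[1], y[2]);
      return 0;
    }
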
diff --git a/lib/TH/generic/simd/convolve.c b/lib/TH/generic/simd/convolve.c
index 842af17..bf07bbe 100644
--- a/lib/TH/generic/simd/convolve.c
+++ b/lib/TH/generic/simd/convolve.c
@@ -1,4 +1,4 @@
-#if defined(USE_AVX)
+#if defined(__AVX__)
 
 #ifdef _MSC_VER
 #include <intrin.h>
@@ -113,7 +113,7 @@ void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows,
 void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);
 
 void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols) {
-#if defined(USE_AVX)
+#if defined(__AVX__)
   int avx = haveCPUFeature(kCPUFeature_AVX);
   if (avx)
   {
@@ -124,4 +124,4 @@ void convolve_5x5(float* output, float* input, float* kernel, long outRows, long
   {
     convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols);
   }
-}
\ No newline at end of file
+}
diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h
index aa3b722..19d41b1 100644
--- a/lib/TH/generic/simd/simd.h
+++ b/lib/TH/generic/simd/simd.h
@@ -2,14 +2,15 @@
 #define TH_SIMD_INC
 
 #include <stdint.h>
+#include <stdlib.h>
 #if defined(_MSC_VER)
 #include <intrin.h>
-#elif defined(HAVE_GCC_GET_CPUID)
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
 #include <cpuid.h>
 #endif
 
 // Can be found on Intel ISA Reference for CPUID
-#define CPUID_AVX2_BIT 0x10       // Bit 5 of EBX for EAX=0x7
+#define CPUID_AVX2_BIT 0x20       // Bit 5 of EBX for EAX=0x7
 #define CPUID_AVX_BIT  0x10000000 // Bit 28 of ECX for EAX=0x1
 #define CPUID_SSE_BIT  0x2000000  // bit 25 of EDX for EAX=0x1
 
@@ -99,13 +100,13 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
   *ebx = cpuInfo[1];
   *ecx = cpuInfo[2];
   *edx = cpuInfo[3];
-#elif defined(HAVE_GCC_GET_CPUID)
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
   uint32_t level = *eax;
   __get_cpuid (level, eax, ebx, ecx, edx);
 #else
-  uint32_t a = *eax, b, c, d;
+  uint32_t a = *eax, b, c = *ecx, d;
   asm volatile ( "cpuid\n\t"
-                 : "+a"(a), "=b"(b), "=c"(c), "=d"(d) );
+		 : "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
   *eax = a;
   *ebx = b;
   *ecx = c;
@@ -117,19 +118,38 @@ static inline uint32_t detectHostSIMDExtensions()
 {
   uint32_t eax, ebx, ecx, edx;
   uint32_t hostSimdExts = 0x0;
+  int TH_NO_AVX = 1, TH_NO_AVX2 = 1, TH_NO_SSE = 1;
+  char *evar;
+
+  evar = getenv("TH_NO_AVX2");
+  if (evar == NULL || strncmp(evar, "1", 2) != 0)
+    TH_NO_AVX2 = 0;
 
   // Check for AVX2. Requires separate CPUID
   eax = 0x7;
+  ecx = 0x0;
   cpuid(&eax, &ebx, &ecx, &edx);
-  if (ebx & CPUID_AVX2_BIT)
+  if ((ebx & CPUID_AVX2_BIT) && TH_NO_AVX2 == 0) {
     hostSimdExts |= SIMDExtension_AVX2;
+  }
 
+  // Detect and enable AVX and SSE
   eax = 0x1;
   cpuid(&eax, &ebx, &ecx, &edx);
-  if (ecx & CPUID_AVX_BIT)
+
+  evar = getenv("TH_NO_AVX");
+  if (evar == NULL || strncmp(evar, "1", 2) != 0)
+    TH_NO_AVX = 0;
+  if (ecx & CPUID_AVX_BIT && TH_NO_AVX == 0) {
     hostSimdExts |= SIMDExtension_AVX;
-  if (edx & CPUID_SSE_BIT)
+  }
+
+  evar = getenv("TH_NO_SSE");
+  if (evar == NULL || strncmp(evar, "1", 2) != 0)
+    TH_NO_SSE = 0;  
+  if (edx & CPUID_SSE_BIT && TH_NO_SSE == 0) {
     hostSimdExts |= SIMDExtension_SSE;
+  }
 
   return hostSimdExts;
 }
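
detectHostSIMDExtensions() now also honours the TH_NO_AVX2, TH_NO_AVX and TH_NO_SSE environment variables, so a specific code path can be vetoed at runtime without rebuilding. A small isolated sketch of the same getenv gate (pulling in <string.h> for strncmp; the variable name in main is just an example):

    /* Sketch of the environment gate used above: a feature stays enabled
     * unless the corresponding TH_NO_* variable is set to "1". */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int disabled_by_env(const char *name)
    {
      const char *evar = getenv(name);
      return evar != NULL && strncmp(evar, "1", 2) == 0;
    }

    int main(void)
    {
      /* e.g. run as: TH_NO_AVX=1 ./a.out */
      printf("AVX %s\n", disabled_by_env("TH_NO_AVX") ? "disabled" : "allowed");
      return 0;
    }
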
diff --git a/lib/TH/vector/AVX.c b/lib/TH/vector/AVX.c
new file mode 100644
index 0000000..b7d5dd1
--- /dev/null
+++ b/lib/TH/vector/AVX.c
@@ -0,0 +1,274 @@
+#if defined(__AVX__)
+#ifndef _MSC_VER
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+
+#include "AVX.h"
+
+void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) {
+  ptrdiff_t i;
+  ptrdiff_t off;
+  for (i=0; i<=((n)-8); i+=8) {
+    _mm256_storeu_pd(y+i, _mm256_loadu_pd(x+i));
+    _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(x+i+4));
+  }
+  off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    y[off+i] = x[off+i];
+  }
+}
+
+void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  ptrdiff_t off;
+  __m256d YMM0 = _mm256_set_pd(c, c, c, c);
+  for (i=0; i<=((n)-16); i+=16) {
+    _mm256_storeu_pd((x)+i  , YMM0);
+    _mm256_storeu_pd((x)+i+4, YMM0);
+    _mm256_storeu_pd((x)+i+8, YMM0);
+    _mm256_storeu_pd((x)+i+12, YMM0);
+  }
+  off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    x[off+i] = c;
+  }
+}
+
+void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256d YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-8); i+=8) {
+    YMM0 = _mm256_loadu_pd(x+i);
+    YMM1 = _mm256_loadu_pd(x+i+4);
+    YMM2 = _mm256_loadu_pd(y+i);
+    YMM3 = _mm256_loadu_pd(y+i+4);
+    YMM2 = _mm256_div_pd(YMM0, YMM2);
+    YMM3 = _mm256_div_pd(YMM1, YMM3);
+    _mm256_storeu_pd(z+i, YMM2);
+    _mm256_storeu_pd(z+i+4, YMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+  __m256d YMM0, YMM1;
+  for (i=0; i<=((n)-8); i+=8) {
+    YMM0 = _mm256_loadu_pd(x+i);
+    YMM1 = _mm256_loadu_pd(x+i+4);
+    YMM0 = _mm256_div_pd(YMM0, YMM15);
+    YMM1 = _mm256_div_pd(YMM1, YMM15);
+    _mm256_storeu_pd(y+i, YMM0);
+    _mm256_storeu_pd(y+i+4, YMM1);
+  }
+  for (; i<(n); i++) {
+    y[i] = x[i] / c;
+  }
+}
+
+void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256d YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-8); i+=8) {
+    YMM0 = _mm256_loadu_pd(x+i);
+    YMM1 = _mm256_loadu_pd(x+i+4);
+    YMM2 = _mm256_loadu_pd(y+i);
+    YMM3 = _mm256_loadu_pd(y+i+4);
+    YMM2 = _mm256_mul_pd(YMM0, YMM2);
+    YMM3 = _mm256_mul_pd(YMM1, YMM3);
+    _mm256_storeu_pd(z+i, YMM2);
+    _mm256_storeu_pd(z+i+4, YMM3);
+  }
+  for (; i<n; i++) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+  __m256d YMM0, YMM1;
+  for (i=0; i<=((n)-8); i+=8) {
+    YMM0 = _mm256_loadu_pd(x+i);
+    YMM1 = _mm256_loadu_pd(x+i+4);
+    YMM0 = _mm256_mul_pd(YMM0, YMM15);
+    YMM1 = _mm256_mul_pd(YMM1, YMM15);
+    _mm256_storeu_pd(y+i, YMM0);
+    _mm256_storeu_pd(y+i+4, YMM1);
+  }
+  for (; i<n; i++) {
+    y[i] = x[i] * c;
+  }
+}
+
+void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+  __m256d YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-4); i+=4) {
+    YMM0 = _mm256_loadu_pd(y+i);
+    YMM1 = _mm256_loadu_pd(x+i);
+    YMM2 = _mm256_mul_pd(YMM0, YMM15);
+    YMM3 = _mm256_add_pd(YMM1, YMM2);
+    _mm256_storeu_pd(z+i, YMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] + y[i] * c;
+  }
+}
+
+void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+  __m256d YMM0, YMM1;
+  for (i=0; i<=((n)-8); i+=8) {
+    YMM0 = _mm256_loadu_pd(x+i);
+    YMM1 = _mm256_loadu_pd(x+i+4);
+    YMM0 = _mm256_add_pd(YMM0, YMM15);
+    YMM1 = _mm256_add_pd(YMM1, YMM15);
+    _mm256_storeu_pd(y+i, YMM0);
+    _mm256_storeu_pd(y+i+4, YMM1);
+  }
+  for (; i<(n); i++) {
+    y[i] = x[i] + c;
+  }
+}
+
+void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) {
+  ptrdiff_t i;
+  ptrdiff_t off;
+  for (i=0; i<=((n)-16); i+=16) {
+    _mm256_storeu_ps(y+i, _mm256_loadu_ps(x+i));
+    _mm256_storeu_ps(y+i+8, _mm256_loadu_ps(x+i+8));
+  }
+  off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    y[off+i] = x[off+i];
+  }
+}
+
+void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  ptrdiff_t off;
+  __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+  for (i=0; i<=((n)-32); i+=32) {
+    _mm256_storeu_ps((x)+i  , YMM0);
+    _mm256_storeu_ps((x)+i+8, YMM0);
+    _mm256_storeu_ps((x)+i+16, YMM0);
+    _mm256_storeu_ps((x)+i+24, YMM0);
+  }
+  off = (n) - ((n)%32);
+  for (i=0; i<((n)%32); i++) {
+    x[off+i] = c;
+  }
+}
+
+void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256 YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-16); i+=16) {
+    YMM0 = _mm256_loadu_ps(x+i);
+    YMM1 = _mm256_loadu_ps(x+i+8);
+    YMM2 = _mm256_loadu_ps(y+i);
+    YMM3 = _mm256_loadu_ps(y+i+8);
+    YMM2 = _mm256_div_ps(YMM0, YMM2);
+    YMM3 = _mm256_div_ps(YMM1, YMM3);
+    _mm256_storeu_ps(z+i, YMM2);
+    _mm256_storeu_ps(z+i+8, YMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+  __m256 YMM0, YMM1;
+  for (i=0; i<=((n)-16); i+=16) {
+    YMM0 = _mm256_loadu_ps(x+i);
+    YMM1 = _mm256_loadu_ps(x+i+8);
+    YMM0 = _mm256_div_ps(YMM0, YMM15);
+    YMM1 = _mm256_div_ps(YMM1, YMM15);
+    _mm256_storeu_ps(y+i, YMM0);
+    _mm256_storeu_ps(y+i+8, YMM1);
+  }
+  for (; i<(n); i++) {
+    y[i] = x[i] / c;
+  }
+}
+
+void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256 YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-16); i+=16) {
+    YMM0 = _mm256_loadu_ps(x+i);
+    YMM1 = _mm256_loadu_ps(x+i+8);
+    YMM2 = _mm256_loadu_ps(y+i);
+    YMM3 = _mm256_loadu_ps(y+i+8);
+    YMM2 = _mm256_mul_ps(YMM0, YMM2);
+    YMM3 = _mm256_mul_ps(YMM1, YMM3);
+    _mm256_storeu_ps(z+i, YMM2);
+    _mm256_storeu_ps(z+i+8, YMM3);
+  }
+  for (; i<n; i++) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+  __m256 YMM0, YMM1;
+  for (i=0; i<=((n)-16); i+=16) {
+    YMM0 = _mm256_loadu_ps(x+i);
+    YMM1 = _mm256_loadu_ps(x+i+8);
+    YMM0 = _mm256_mul_ps(YMM0, YMM15);
+    YMM1 = _mm256_mul_ps(YMM1, YMM15);
+    _mm256_storeu_ps(y+i, YMM0);
+    _mm256_storeu_ps(y+i+8, YMM1);
+  }
+  for (; i<n; i++) {
+    y[i] = x[i] * c;
+  }
+}
+
+void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+  __m256 YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-8); i+=8) {
+    YMM0 = _mm256_loadu_ps(y+i);
+    YMM1 = _mm256_loadu_ps(x+i);
+    YMM2 = _mm256_mul_ps(YMM0, YMM15);
+    YMM3 = _mm256_add_ps(YMM1, YMM2);
+    _mm256_storeu_ps(z+i, YMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] + y[i] * c;
+  }
+}
+
+void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+  __m256 YMM0, YMM1;
+  for (i=0; i<=((n)-16); i+=16) {
+    YMM0 = _mm256_loadu_ps(x+i);
+    YMM1 = _mm256_loadu_ps(x+i+8);
+    YMM0 = _mm256_add_ps(YMM0, YMM15);
+    YMM1 = _mm256_add_ps(YMM1, YMM15);
+    _mm256_storeu_ps(y+i, YMM0);
+    _mm256_storeu_ps(y+i+8, YMM1);
+  }
+  for (; i<(n); i++) {
+    y[i] = x[i] + c;
+  }
+}
+
+#endif // defined(__AVX__)
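
The AVX kernels above use unaligned loads and stores (_mm256_loadu_pd / _mm256_storeu_pd) over the bulk of the array and finish with a scalar tail loop for the remainder. A hedged usage sketch that calls THDoubleVector_cadd_AVX and checks it against a plain loop; it assumes compilation with AVX enabled (e.g. -mavx) and linking against lib/TH/vector/AVX.c:

    /* Usage sketch: z = x + c*y via the new AVX kernel, checked against a
     * scalar loop. Illustration only, not part of the TH test suite. */
    #include <stdio.h>
    #include <stddef.h>
    #include "AVX.h"   /* lib/TH/vector/AVX.h */

    int main(void)
    {
      enum { N = 10 };                  /* odd size exercises the tail loop */
      double x[N], y[N], z[N], ref[N];
      for (int i = 0; i < N; i++) { x[i] = i; y[i] = 2.0*i; }

      THDoubleVector_cadd_AVX(z, x, y, 0.5, (ptrdiff_t)N);

      for (int i = 0; i < N; i++) ref[i] = x[i] + 0.5*y[i];
      for (int i = 0; i < N; i++)
        if (z[i] != ref[i]) { printf("mismatch at %d\n", i); return 1; }
      printf("ok\n");
      return 0;
    }
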
diff --git a/lib/TH/vector/AVX.h b/lib/TH/vector/AVX.h
new file mode 100644
index 0000000..bfaeaa6
--- /dev/null
+++ b/lib/TH/vector/AVX.h
@@ -0,0 +1,23 @@
+#ifndef TH_AVX_H
+#define TH_AVX_H
+
+#include <stddef.h>
+
+void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
+void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
+void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+
+#endif
diff --git a/lib/TH/vector/AVX2.c b/lib/TH/vector/AVX2.c
new file mode 100644
index 0000000..082a680
--- /dev/null
+++ b/lib/TH/vector/AVX2.c
@@ -0,0 +1,47 @@
+#if defined(__AVX2__)
+#ifndef _MSC_VER
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+#include "AVX2.h"
+
+void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+  __m256d YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-8); i+=8) {
+    YMM0 = _mm256_loadu_pd(y+i);
+    YMM1 = _mm256_loadu_pd(y+i+4);
+    YMM2 = _mm256_loadu_pd(x+i);
+    YMM3 = _mm256_loadu_pd(x+i+4);
+    YMM2 = _mm256_fmadd_pd(YMM0, YMM15, YMM2);
+    YMM3 = _mm256_fmadd_pd(YMM1, YMM15, YMM3);
+    _mm256_storeu_pd(z+i, YMM2);
+    _mm256_storeu_pd(z+i+4, YMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] + y[i] * c;
+  }
+}
+
+void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+  __m256 YMM0, YMM1, YMM2, YMM3;
+  for (i=0; i<=((n)-16); i+=16) {
+    YMM0 = _mm256_loadu_ps(y+i);
+    YMM1 = _mm256_loadu_ps(y+i+8);
+    YMM2 = _mm256_loadu_ps(x+i);
+    YMM3 = _mm256_loadu_ps(x+i+8);
+    YMM2 = _mm256_fmadd_ps(YMM0, YMM15, YMM2);
+    YMM3 = _mm256_fmadd_ps(YMM1, YMM15, YMM3);
+    _mm256_storeu_ps(z+i, YMM2);
+    _mm256_storeu_ps(z+i+8, YMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] + y[i] * c;
+  }
+}
+
+#endif // defined(__AVX2__)
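
The AVX2 variants of cadd differ from the AVX ones only in using _mm256_fmadd_pd / _mm256_fmadd_ps, which evaluates y*c + x as a single fused multiply-add with one rounding. A scalar sketch of that contraction using fma() from <math.h> (link with -lm; the two results may or may not differ depending on the operands):

    /* Sketch: the AVX2 cadd kernels compute z[i] = fma(y[i], c, x[i]) per lane,
     * i.e. y*c + x with a single rounding, versus the separate multiply and
     * add used on the plain AVX path. */
    #include <stdio.h>
    #include <math.h>

    int main(void)
    {
      double x = 1.0, y = 3.0, c = 1.0/3.0;
      double fused    = fma(y, c, x);     /* one rounding */
      double separate = x + y * c;        /* two roundings */
      printf("fused=%.17g separate=%.17g\n", fused, separate);
      return 0;
    }
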
diff --git a/lib/TH/vector/AVX2.h b/lib/TH/vector/AVX2.h
new file mode 100644
index 0000000..85a9e93
--- /dev/null
+++ b/lib/TH/vector/AVX2.h
@@ -0,0 +1,9 @@
+#ifndef TH_AVX2_H
+#define TH_AVX2_H
+
+#include <stddef.h>
+
+void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+
+#endif
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c
index 327b006..7920fb1 100644
--- a/lib/TH/vector/NEON.c
+++ b/lib/TH/vector/NEON.c
@@ -14,65 +14,92 @@ static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n)
 
 }
 
-
-static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
+static void THFloatVector_cmul_NEON(float *z, const float *x, const float* y, const ptrdiff_t n) {
   long i = 0;
 
   for(; i < n-4; i += 4)
   {
-    z[i] = x[i] - y[i];
-    z[i+1] = x[i+1] - y[i+1];
-    z[i+2] = x[i+2] - y[i+2];
-    z[i+3] = x[i+3] - y[i+3];
+    z[i] = x[i] * y[i];
+    z[i+1] = x[i+1] * y[i+1];
+    z[i+2] = x[i+2] * y[i+2];
+    z[i+3] = x[i+3] * y[i+3];
   }
 
   for(; i < n; i++)
-    z[i] = x[i] - y[i];
+    z[i] = x[i] * y[i];
+}
+
+static void THFloatVector_muls_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
+  long i = 0;
+
+  for(; i < n-4; i += 4)
+  {
+    y[i] = x[i] * c;
+    y[i+1] = x[i+1] * c;
+    y[i+2] = x[i+2] * c;
+    y[i+3] = x[i+3] * c;
+  }
 
+  for(; i < n; i++)
+    y[i] = x[i] * c;
 }
 
+static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+  long i = 0;
+
+  for(;i < n-4; i += 4)
+  {
+    z[i] = x[i] + c * y[i];
+    z[i+1] = x[i+1] + c * y[i+1];
+    z[i+2] = x[i+2] + c * y[i+2];
+    z[i+3] = x[i+3] + c * y[i+3];
+  }
+
+  for(; i < n; i++)
+    z[i] = x[i] + c * y[i];
+}
 
-static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
+static void THFloatVector_adds_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
   long i = 0;
 
-  for(; i < n-4; i +=4)
+  for(;i < n-4; i += 4)
   {
-    y[i] *= c;
-    y[i+1] *= c;
-    y[i+2] *= c;
-    y[i+3] *= c;
+    y[i] = x[i] + c;
+    y[i+1] = x[i+1] + c;
+    y[i+2] = x[i+2] + c;
+    y[i+3] = x[i+3] + c;
   }
 
   for(; i < n; i++)
-    y[i] *= c;
+    y[i] = x[i] + c;
 }
 
-static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
+static void THFloatVector_cdiv_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
   long i = 0;
 
-  for(; i < n-4; i += 4)
+  for(;i < n-4; i += 4)
   {
-    y[i] *= x[i];
-    y[i+1] *= x[i+1];
-    y[i+2] *= x[i+2];
-    y[i+3] *= x[i+3];
+    z[i] = x[i] / y[i];
+    z[i+1] = x[i+1] / y[i+1];
+    z[i+2] = x[i+2] / y[i+2];
+    z[i+3] = x[i+3] / y[i+3];
   }
 
   for(; i < n; i++)
-    y[i] *= x[i];
+    z[i] = x[i] / y[i];
 }
 
-static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
+static void THFloatVector_divs_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
   long i = 0;
 
   for(;i < n-4; i += 4)
   {
-    y[i] += c * x[i];
-    y[i+1] += c * x[i+1];
-    y[i+2] += c * x[i+2];
-    y[i+3] += c * x[i+3];
+    y[i] = x[i] / c;
+    y[i+1] = x[i+1] / c;
+    y[i+2] = x[i+2] / c;
+    y[i+3] = x[i+3] / c;
   }
 
   for(; i < n; i++)
-    y[i] += c * x[i];
+    y[i] = x[i] / c;
 }
diff --git a/lib/TH/vector/SSE.c b/lib/TH/vector/SSE.c
index 781b037..d026935 100644
--- a/lib/TH/vector/SSE.c
+++ b/lib/TH/vector/SSE.c
@@ -4,7 +4,6 @@
 #include <intrin.h>
 #endif
 
-
 static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n) {
   ptrdiff_t i;
   ptrdiff_t off;
@@ -21,70 +20,40 @@ static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n
   }
 }
 
-
-static void THDoubleVector_add_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
-  ptrdiff_t i = 0;
+static void THDoubleVector_cadd_SSE(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
   __m128d XMM7 = _mm_set1_pd(c);
-  __m128d XMM0,XMM2;
-  for (; i<=((n)-2); i+=2) {
+  __m128d XMM0, XMM2;
+  for (i=0; i<=((n)-2); i+=2) {
     XMM0 = _mm_loadu_pd((x)+i);
     XMM2 = _mm_loadu_pd((y)+i);
-    XMM0 = _mm_mul_pd(XMM0, XMM7);
-    XMM2 = _mm_add_pd(XMM2, XMM0);
-    _mm_storeu_pd((y)+i  , XMM2);
+    XMM2 = _mm_mul_pd(XMM2, XMM7);
+    XMM2 = _mm_add_pd(XMM0, XMM2);
+    _mm_storeu_pd((z)+i, XMM2);
   }
   for (; i<(n); i++) {
-    y[i] += c * x[i];
-  }
-}
-
-
-static void THDoubleVector_diff_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
-  ptrdiff_t i;
-  for (i=0; i<=((n)-8); i+=8) {
-    __m128d XMM0 = _mm_loadu_pd((x)+i  );
-    __m128d XMM1 = _mm_loadu_pd((x)+i+2);
-    __m128d XMM2 = _mm_loadu_pd((x)+i+4);
-    __m128d XMM3 = _mm_loadu_pd((x)+i+6);
-    __m128d XMM4 = _mm_loadu_pd((y)+i  );
-    __m128d XMM5 = _mm_loadu_pd((y)+i+2);
-    __m128d XMM6 = _mm_loadu_pd((y)+i+4);
-    __m128d XMM7 = _mm_loadu_pd((y)+i+6);
-    XMM0 = _mm_sub_pd(XMM0, XMM4);
-    XMM1 = _mm_sub_pd(XMM1, XMM5);
-    XMM2 = _mm_sub_pd(XMM2, XMM6);
-    XMM3 = _mm_sub_pd(XMM3, XMM7);
-    _mm_storeu_pd((z)+i  , XMM0);
-    _mm_storeu_pd((z)+i+2, XMM1);
-    _mm_storeu_pd((z)+i+4, XMM2);
-    _mm_storeu_pd((z)+i+6, XMM3);
-  }
-  ptrdiff_t off = (n) - ((n)%8);
-  for (i=0; i<((n)%8); i++) {
-    z[off+i] = x[off+i] - y[off+i];
+    z[i] = x[i] + c * y[i];
   }
 }
 
-
-static void THDoubleVector_scale_SSE(double *y, const double c, const ptrdiff_t n) {
+static void THDoubleVector_adds_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
   ptrdiff_t i;
   __m128d XMM7 = _mm_set1_pd(c);
+  __m128d XMM0, XMM2;
   for (i=0; i<=((n)-4); i+=4) {
-    __m128d XMM0 = _mm_loadu_pd((y)+i  );
-    __m128d XMM1 = _mm_loadu_pd((y)+i+2);
-    XMM0 = _mm_mul_pd(XMM0, XMM7);
-    XMM1 = _mm_mul_pd(XMM1, XMM7);
-    _mm_storeu_pd((y)+i  , XMM0);
-    _mm_storeu_pd((y)+i+2, XMM1);
+    XMM0 = _mm_loadu_pd((x)+i);
+    XMM2 = _mm_loadu_pd((x)+i+2);
+    XMM0 = _mm_add_pd(XMM0, XMM7);
+    XMM2 = _mm_add_pd(XMM2, XMM7);
+    _mm_storeu_pd((y)+i, XMM0);
+    _mm_storeu_pd((y)+i+2, XMM2);
   }
-  ptrdiff_t off = (n) - ((n)%4);
-  for (i=0; i<((n)%4); i++) {
-    y[off+i] *= c;
+  for (; i<(n); i++) {
+    y[i] = x[i] + c;
   }
 }
 
-
-static void THDoubleVector_mul_SSE(double *y, const double *x, const ptrdiff_t n) {
+static void THDoubleVector_cmul_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
   ptrdiff_t i;
   for (i=0; i<=((n)-8); i+=8) {
     __m128d XMM0 = _mm_loadu_pd((x)+i  );
@@ -99,17 +68,72 @@ static void THDoubleVector_mul_SSE(double *y, const double *x, const ptrdiff_t n
     XMM5 = _mm_mul_pd(XMM5, XMM1);
     XMM6 = _mm_mul_pd(XMM6, XMM2);
     XMM7 = _mm_mul_pd(XMM7, XMM3);
+    _mm_storeu_pd((z)+i  , XMM4);
+    _mm_storeu_pd((z)+i+2, XMM5);
+    _mm_storeu_pd((z)+i+4, XMM6);
+    _mm_storeu_pd((z)+i+6, XMM7);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+static void THDoubleVector_muls_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m128d XMM15 = _mm_set1_pd(c);
+  for (i=0; i<=((n)-8); i+=8) {
+    __m128d XMM0 = _mm_loadu_pd((x)+i  );
+    __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+    __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+    __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+    __m128d XMM4 = _mm_mul_pd(XMM15, XMM0);
+    __m128d XMM5 = _mm_mul_pd(XMM15, XMM1);
+    __m128d XMM6 = _mm_mul_pd(XMM15, XMM2);
+    __m128d XMM7 = _mm_mul_pd(XMM15, XMM3);
     _mm_storeu_pd((y)+i  , XMM4);
     _mm_storeu_pd((y)+i+2, XMM5);
     _mm_storeu_pd((y)+i+4, XMM6);
     _mm_storeu_pd((y)+i+6, XMM7);
   }
-  ptrdiff_t off = (n) - ((n)%8);
-  for (i=0; i<((n)%8); i++) {
-    y[off+i] *= x[off+i];
+  for (; i<(n); i++) {
+    y[i] = x[i] * c;
+  }
+}
+
+static void THDoubleVector_cdiv_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m128d XMM0, XMM1, XMM2, XMM3;
+  for (i=0; i<=((n)-4); i+=4) {
+    XMM0 = _mm_loadu_pd(x+i);
+    XMM1 = _mm_loadu_pd(x+i+2);
+    XMM2 = _mm_loadu_pd(y+i);
+    XMM3 = _mm_loadu_pd(y+i+2);
+    XMM2 = _mm_div_pd(XMM0, XMM2);
+    XMM3 = _mm_div_pd(XMM1, XMM3);
+    _mm_storeu_pd(z+i, XMM2);
+    _mm_storeu_pd(z+i+2, XMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] / y[i];
   }
 }
 
+static void THDoubleVector_divs_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m128d XMM7 = _mm_set1_pd(c);
+  __m128d XMM0, XMM1;
+  for (i=0; i<=((n)-4); i+=4) {
+    XMM0 = _mm_loadu_pd(x+i);
+    XMM1 = _mm_loadu_pd(x+i+2);
+    XMM0 = _mm_div_pd(XMM0, XMM7);
+    XMM1 = _mm_div_pd(XMM1, XMM7);
+    _mm_storeu_pd(y+i, XMM0);
+    _mm_storeu_pd(y+i+2, XMM1);
+  }
+  for (; i<(n); i++) {
+    y[i] = x[i] / c;
+  }
+}
 
 static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) {
   ptrdiff_t i;
@@ -128,24 +152,40 @@ static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) {
 }
 
 
-static void THFloatVector_add_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
-  ptrdiff_t i = 0;
+static void THFloatVector_cadd_SSE(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
   __m128 XMM7 = _mm_set_ps1(c);
-  __m128 XMM0,XMM2;
-  for (; i<=((n)-4); i+=4) {
+  __m128 XMM0, XMM2;
+  for (i=0; i<=((n)-4); i+=4) {
     XMM0 = _mm_loadu_ps((x)+i);
     XMM2 = _mm_loadu_ps((y)+i);
-    XMM0 = _mm_mul_ps(XMM0, XMM7);
-    XMM2 = _mm_add_ps(XMM2, XMM0);
-    _mm_storeu_ps((y)+i  , XMM2);
+    XMM2 = _mm_mul_ps(XMM2, XMM7);
+    XMM2 = _mm_add_ps(XMM0, XMM2);
+    _mm_storeu_ps((z)+i, XMM2);
   }
   for (; i<(n); i++) {
-    y[i] += c * x[i];
+    z[i] = x[i] + c * y[i];
   }
 }
 
+static void THFloatVector_adds_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m128 XMM7 = _mm_set1_ps(c);
+  __m128 XMM0, XMM2;
+  for (i=0; i<=((n)-8); i+=8) {
+    XMM0 = _mm_loadu_ps((x)+i);
+    XMM2 = _mm_loadu_ps((x)+i+4);
+    XMM0 = _mm_add_ps(XMM0, XMM7);
+    XMM2 = _mm_add_ps(XMM2, XMM7);
+    _mm_storeu_ps((y)+i, XMM0);
+    _mm_storeu_ps((y)+i+4, XMM2);
+  }
+  for (; i<(n); i++) {
+    y[i] = x[i] + c;
+  }
+}
 
-static void THFloatVector_diff_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
+static void THFloatVector_cmul_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
   ptrdiff_t i;
   for (i=0; i<=((n)-16); i+=16) {
     __m128 XMM0 = _mm_loadu_ps((x)+i   );
@@ -156,62 +196,73 @@ static void THFloatVector_diff_SSE(float *z, const float *x, const float *y, con
     __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
     __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
     __m128 XMM7 = _mm_loadu_ps((y)+i+12);
-    XMM0 = _mm_sub_ps(XMM0, XMM4);
-    XMM1 = _mm_sub_ps(XMM1, XMM5);
-    XMM2 = _mm_sub_ps(XMM2, XMM6);
-    XMM3 = _mm_sub_ps(XMM3, XMM7);
-    _mm_storeu_ps((z)+i   , XMM0);
-    _mm_storeu_ps((z)+i+ 4, XMM1);
-    _mm_storeu_ps((z)+i+ 8, XMM2);
-    _mm_storeu_ps((z)+i+12, XMM3);
-  }
-  ptrdiff_t off = (n) - ((n)%16);
-  for (i=0; i<((n)%16); i++) {
-    z[off+i] = x[off+i] - y[off+i];
+    XMM4 = _mm_mul_ps(XMM4, XMM0);
+    XMM5 = _mm_mul_ps(XMM5, XMM1);
+    XMM6 = _mm_mul_ps(XMM6, XMM2);
+    XMM7 = _mm_mul_ps(XMM7, XMM3);
+    _mm_storeu_ps((z)+i   , XMM4);
+    _mm_storeu_ps((z)+i+ 4, XMM5);
+    _mm_storeu_ps((z)+i+ 8, XMM6);
+    _mm_storeu_ps((z)+i+12, XMM7);
   }
-}
-
-
-static void THFloatVector_scale_SSE(float *y, const float c, const ptrdiff_t n) {
-  ptrdiff_t i;
-  __m128 XMM7 = _mm_set_ps1(c);
-  for (i=0; i<=((n)-8); i+=8) {
-    __m128 XMM0 = _mm_loadu_ps((y)+i  );
-    __m128 XMM1 = _mm_loadu_ps((y)+i+4);
-    XMM0 = _mm_mul_ps(XMM0, XMM7);
-    XMM1 = _mm_mul_ps(XMM1, XMM7);
-    _mm_storeu_ps((y)+i  , XMM0);
-    _mm_storeu_ps((y)+i+4, XMM1);
-  }
-  ptrdiff_t off = (n) - ((n)%8);
-  for (i=0; i<((n)%8); i++) {
-    y[off+i] *= c;
+  for (; i<(n); i++) {
+    z[i] = x[i] * y[i];
   }
 }
 
-
-static void THFloatVector_mul_SSE(float *y, const float *x, const ptrdiff_t n) {
+static void THFloatVector_muls_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
   ptrdiff_t i;
+  __m128 XMM15 = _mm_set_ps1(c);
   for (i=0; i<=((n)-16); i+=16) {
     __m128 XMM0 = _mm_loadu_ps((x)+i   );
     __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
     __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
     __m128 XMM3 = _mm_loadu_ps((x)+i+12);
-    __m128 XMM4 = _mm_loadu_ps((y)+i   );
-    __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
-    __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
-    __m128 XMM7 = _mm_loadu_ps((y)+i+12);
-    XMM4 = _mm_mul_ps(XMM4, XMM0);
-    XMM5 = _mm_mul_ps(XMM5, XMM1);
-    XMM6 = _mm_mul_ps(XMM6, XMM2);
-    XMM7 = _mm_mul_ps(XMM7, XMM3);
+    __m128 XMM4 = _mm_mul_ps(XMM15, XMM0);
+    __m128 XMM5 = _mm_mul_ps(XMM15, XMM1);
+    __m128 XMM6 = _mm_mul_ps(XMM15, XMM2);
+    __m128 XMM7 = _mm_mul_ps(XMM15, XMM3);
     _mm_storeu_ps((y)+i   , XMM4);
     _mm_storeu_ps((y)+i+ 4, XMM5);
     _mm_storeu_ps((y)+i+ 8, XMM6);
     _mm_storeu_ps((y)+i+12, XMM7);
   }
-  ptrdiff_t off = (n) - ((n)%16);
-  for (i=0; i<((n)%16); i++) {
-    y[off+i] *= x[off+i];
+  for (; i<(n); i++) {
+    y[i] = x[i] * c;
+  }
+}
+
+static void THFloatVector_cdiv_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m128 XMM0, XMM1, XMM2, XMM3;
+  for (i=0; i<=((n)-8); i+=8) {
+    XMM0 = _mm_loadu_ps(x+i);
+    XMM1 = _mm_loadu_ps(x+i+4);
+    XMM2 = _mm_loadu_ps(y+i);
+    XMM3 = _mm_loadu_ps(y+i+4);
+    XMM2 = _mm_div_ps(XMM0, XMM2);
+    XMM3 = _mm_div_ps(XMM1, XMM3);
+    _mm_storeu_ps(z+i, XMM2);
+    _mm_storeu_ps(z+i+4, XMM3);
+  }
+  for (; i<(n); i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+static void THFloatVector_divs_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
+  ptrdiff_t i;
+  __m128 XMM7 = _mm_set1_ps(c);
+  __m128 XMM0, XMM1;
+  for (i=0; i<=((n)-8); i+=8) {
+    XMM0 = _mm_loadu_ps(x+i);
+    XMM1 = _mm_loadu_ps(x+i+4);
+    XMM0 = _mm_div_ps(XMM0, XMM7);
+    XMM1 = _mm_div_ps(XMM1, XMM7);
+    _mm_storeu_ps(y+i, XMM0);
+    _mm_storeu_ps(y+i+4, XMM1);
+  }
+  for (; i<(n); i++) {
+    y[i] = x[i] / c;
   }
 }
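
Together with the interface rename, the SSE kernels switch from the old in-place accumulation (y[i] += c * x[i] in add_SSE) to the three-operand form z[i] = x[i] + c * y[i] used by cadd_SSE, matching the DEFAULT and AVX versions. A plain scalar reference of both forms side by side (not TH code):

    /* Scalar reference for the old in-place form vs. the new three-operand
     * form implemented by the SSE kernels above. */
    #include <stdio.h>

    static void add_old(double *y, const double *x, double c, long n)
    { for (long i = 0; i < n; i++) y[i] += c * x[i]; }       /* y accumulates */

    static void cadd_new(double *z, const double *x, const double *y,
                         double c, long n)
    { for (long i = 0; i < n; i++) z[i] = x[i] + c * y[i]; } /* z is written */

    int main(void)
    {
      double x[2] = {1, 2}, y[2] = {10, 20}, z[2];
      cadd_new(z, x, y, 0.5, 2);          /* z = {6, 12}      */
      add_old(y, x, 0.5, 2);              /* y = {10.5, 21}   */
      printf("%g %g | %g %g\n", z[0], z[1], y[0], y[1]);
      return 0;
    }
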
diff --git a/test/test.lua b/test/test.lua
index e7e26e4..6221854 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -112,7 +112,7 @@ local genericSingleOpTest = [[
       end
    end
    return maxerrc, maxerrnc
-]]
+--]]
 
 function torchtest.sin()
    local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'sin'))
@@ -343,6 +343,12 @@ function torchtest.round()
 end
 
 function torchtest.max()  -- torch.max([resval, resind,] x [,dim])
+
+   -- TH_TENSOR_BASE
+   local m1 = torch.Tensor(8,2):fill(3):select(2, 1)
+   local resval, resind = torch.max(m1, 1)
+   mytester:assert(resind[1] == 1)
+
    -- torch.max( x )
    -- contiguous
    local m1 = torch.randn(100,100)
@@ -357,6 +363,7 @@ function torchtest.max()  -- torch.max([resval, resind,] x [,dim])
    end
    local err = res1 - res2
    mytester:assertlt(err, precision, 'error in torch.max - contiguous')
+
    -- non-contiguous
    local m1 = torch.randn(10,10,10)
    local m2 = m1[{{}, 4, {}}]
@@ -371,33 +378,34 @@ function torchtest.max()  -- torch.max([resval, resind,] x [,dim])
    end
    local err = res1 - res2
    mytester:assertlt(err, precision, 'error in torch.max - non-contiguous')
+
    -- torch.max([resval, resind,] x ,dim])
-   local m1 = torch.randn(100,100)
-   local res1val, res1ind = torch.max(m1, 2)
-   local res2val = res1val:clone():zero()
-   local res2ind = res1ind:clone():zero()
-   for i=1, m1:size(1) do
-      res2val[i] = m1[i][1]
-      res2ind[i] = 1
-      for j=1, m1:size(2) do
-         if m1[i][j] > res2val[i][1] then
-            res2val[i] = m1[i][j]
-            res2ind[i] = j
+   function lua_max(t, dim)
+      assert(t:nDimension() == 2)
+      max_val = t:narrow(dim, 1, 1):clone()
+      max_ind = t:narrow(dim, 1, 1):clone():long():fill(1)
+      other = 3 - dim
+      for i = 1, t:size(other) do
+         for j = 1, t:size(dim) do
+            val = t:select(other, i):select(dim, j)
+            max = max_val:select(other, i):select(dim, 1)
+            if val > max then
+               max_val:select(other, i):fill(val)
+               max_ind:select(other, i):fill(j)
+            end
          end
       end
+      return max_val, max_ind
    end
-   local errval = res1val:clone():zero()
-   for i = 1, res1val:size(1) do
-      errval[i] = math.abs(res1val[i][1] - res2val[i][1])
-      mytester:asserteq(res1ind[i][1], res2ind[i][1], 'error in torch.max - non-contiguous')
-   end
-   local maxerr = 0
-   for i = 1, errval:size(1) do
-      if errval[i][1] > maxerr then
-         maxerr = errval[i]
-      end
+
+   local m1 = torch.randn(100,100)
+   for dim = 1,2 do
+      local res1val, res1ind = torch.max(m1, dim)
+      local res2val, res2ind = lua_max(m1, dim)
+      mytester:asserteq((res1val-res2val):abs():max(), 0, 'error in torch.max')
+      mytester:asserteq((res1ind-res2ind):abs():max(), 0, 'error in torch.max')
    end
-   mytester:assertlt(maxerr, precision, 'error in torch.max - non-contiguous')
+
    -- NaNs
    for index in pairs{1, 5, 100} do
       local m1 = torch.randn(100)
@@ -439,33 +447,34 @@ function torchtest.min()  -- torch.min([resval, resind,] x [,dim])
    end
    local err = res1 - res2
    mytester:assertlt(err, precision, 'error in torch.min - non-contiguous')
-   -- torch.min([resval, resind,] x ,dim])
-   local m1 = torch.randn(100,100)
-   local res1val, res1ind = torch.min(m1, 2)
-   local res2val = res1val:clone():zero()
-   local res2ind = res1ind:clone():zero()
-   for i=1, m1:size(1) do
-      res2val[i] = m1[i][1]
-      res2ind[i] = 1
-      for j=1, m1:size(2) do
-         if m1[i][j] < res2val[i][1] then
-            res2val[i] = m1[i][j]
-            res2ind[i] = j
+
+   -- torch.max([resval, resind,] x ,dim])
+   function lua_min(t, dim)
+      assert(t:nDimension() == 2)
+      max_val = t:narrow(dim, 1, 1):clone()
+      max_ind = t:narrow(dim, 1, 1):clone():long():fill(1)
+      other = 3 - dim
+      for i = 1, t:size(other) do
+         for j = 1, t:size(dim) do
+            val = t:select(other, i):select(dim, j)
+            max = max_val:select(other, i):select(dim, 1)
+            if val < max then
+               max_val:select(other, i):fill(val)
+               max_ind:select(other, i):fill(j)
+            end
          end
       end
+      return max_val, max_ind
    end
-   local errval = res1val:clone():zero()
-   for i = 1, res1val:size(1) do
-      errval[i] = math.abs(res1val[i][1] - res2val[i][1])
-      mytester:asserteq(res1ind[i][1], res2ind[i][1], 'error in torch.min - non-contiguous')
-   end
-   local minerr = 0
-   for i = 1, errval:size(1) do
-      if errval[i][1] < minerr then
-         minerr = errval[i]
-      end
+
+   local m1 = torch.randn(100,100)
+   for dim = 1,2 do
+      local res1val, res1ind = torch.min(m1, dim)
+      local res2val, res2ind = lua_min(m1, dim)
+      mytester:asserteq((res1val-res2val):abs():max(), 0, 'error in torch.max')
+      mytester:asserteq((res1ind-res2ind):abs():max(), 0, 'error in torch.max')
    end
-   mytester:assertlt(minerr, precision, 'error in torch.min - non-contiguous')
+
    -- NaNs
    for index in pairs{1, 5, 100} do
       local m1 = torch.randn(100)
@@ -476,6 +485,11 @@ function torchtest.min()  -- torch.min([resval, resind,] x [,dim])
       local res1val = torch.min(m1)
       mytester:assert(res1val ~= res1val, 'error in torch.min - NaNs')
    end
+
+   -- TH_TENSOR_BASE
+   local m1 = torch.Tensor(4):fill(3)
+   local resval, resind = torch.min(m1, 1)
+   mytester:assert(resind[1] == 1)
 end
 
 function torchtest.cmax()
@@ -574,64 +588,117 @@ function torchtest.mv()
    mytester:assertlt(err, precision, 'error in torch.mv')
 end
 
-function torchtest.add()
-   -- [res] torch.add([res,] tensor1, tensor2)
-   local m1 = torch.randn(100,100)
-   local v1 = torch.randn(100)
+function torchtest.fill()
+   local types = {
+      'torch.ByteTensor',
+      'torch.CharTensor',
+      'torch.ShortTensor',
+      'torch.IntTensor',
+      'torch.FloatTensor',
+      'torch.DoubleTensor',
+      'torch.LongTensor',
+   }
 
-   local res1 = torch.add(m1[{ 4,{} }],v1)
+   for k,t in ipairs(types) do
+      -- [res] torch.fill([res,] tensor, value)
+      local m1 = torch.ones(100,100):type(t)
+      local res1 = m1:clone()
+      res1[{ 3,{} }]:fill(2)
+
+      local res2 = m1:clone()
+      for i = 1,m1:size(1) do
+	 res2[{ 3,i }] = 2
+      end
 
-   local res2 = res1:clone():zero()
-   for i = 1,m1:size(2) do
-      res2[i] = m1[4][i] + v1[i]
-   end
+      local err = (res1-res2):double():abs():max()
 
-   local err = (res1-res2):abs():max()
+      mytester:assertlt(err, precision, 'error in torch.fill - contiguous')
 
-   mytester:assertlt(err, precision, 'error in torch.add - contiguous')
+      local m1 = torch.ones(100,100):type(t)
+      local res1 = m1:clone()
+      res1[{ {},3 }]:fill(2)
 
-   local m1 = torch.randn(100,100)
-   local v1 = torch.randn(100)
+      local res2 = m1:clone()
+      for i = 1,m1:size(1) do
+	 res2[{ i,3 }] = 2
+      end
 
-   local res1 = torch.add(m1[{ {},4 }],v1)
+      local err = (res1-res2):double():abs():max()
 
-   local res2 = res1:clone():zero()
-   for i = 1,m1:size(1) do
-      res2[i] = m1[i][4] + v1[i]
+      mytester:assertlt(err, precision, 'error in torch.fill - non contiguous')
    end
+end
 
-   local err = (res1-res2):abs():max()
+function torchtest.add()
+   local types = {
+      'torch.ByteTensor',
+      'torch.CharTensor',
+      'torch.ShortTensor',
+      'torch.IntTensor',
+      'torch.FloatTensor',
+      'torch.DoubleTensor',
+      'torch.LongTensor',
+   }
 
-   mytester:assertlt(err, precision, 'error in torch.add - non contiguous')
+   for k,t in ipairs(types) do
+       -- [res] torch.add([res,] tensor1, tensor2)
+       local m1 = torch.randn(100,100):type(t)
+       local v1 = torch.randn(100):type(t)
 
-   -- [res] torch.add([res,] tensor, value)
-   local m1 = torch.randn(10,10)
-   local res1 = m1:clone()
-   res1[{ 3,{} }]:add(2)
+       local res1 = torch.add(m1[{ 4,{} }],v1)
 
-   local res2 = m1:clone()
-   for i = 1,m1:size(1) do
-      res2[{ 3,i }] = res2[{ 3,i }] + 2
-   end
+       local res2 = res1:clone():zero()
+       for i = 1,m1:size(2) do
+           res2[i] = m1[4][i] + v1[i]
+       end
 
-   local err = (res1-res2):abs():max()
+       local err = (res1-res2):double():abs():max()
 
-   mytester:assertlt(err, precision, 'error in torch.add - scalar, contiguous')
+       mytester:assertlt(err, precision, 'error in torch.add - contiguous' .. ' ' .. t)
 
-   local m1 = torch.randn(10,10)
-   local res1 = m1:clone()
-   res1[{ {},3 }]:add(2)
+       local m1 = torch.randn(100,100):type(t)
+       local v1 = torch.randn(100):type(t)
 
-   local res2 = m1:clone()
-   for i = 1,m1:size(1) do
-      res2[{ i,3 }] = res2[{ i,3 }] + 2
-   end
+       local res1 = torch.add(m1[{ {},4 }],v1)
 
-   local err = (res1-res2):abs():max()
+       local res2 = res1:clone():zero()
+       for i = 1,m1:size(1) do
+           res2[i] = m1[i][4] + v1[i]
+       end
+
+       local err = (res1-res2):double():abs():max()
+
+       mytester:assertlt(err, precision, 'error in torch.add - non contiguous' .. ' ' .. t)
+
+       -- [res] torch.add([res,] tensor, value)
+       local m1 = torch.randn(10,10):type(t)
+       local res1 = m1:clone()
+       res1[{ 3,{} }]:add(2)
+
+       local res2 = m1:clone()
+       for i = 1,m1:size(1) do
+           res2[{ 3,i }] = res2[{ 3,i }] + 2
+       end
+
+       local err = (res1-res2):double():abs():max()
+
+       mytester:assertlt(err, precision, 'error in torch.add - scalar, contiguous' .. ' ' .. t)
+
+       local m1 = torch.randn(10,10)
+       local res1 = m1:clone()
+       res1[{ {},3 }]:add(2)
+
+       local res2 = m1:clone()
+       for i = 1,m1:size(1) do
+           res2[{ i,3 }] = res2[{ i,3 }] + 2
+       end
+
+       local err = (res1-res2):double():abs():max()
 
-   mytester:assertlt(err, precision, 'error in torch.add - scalar, non contiguous')
+       mytester:assertlt(err, precision, 'error in torch.add - scalar, non contiguous' .. ' ' .. t)
 
-   -- [res] torch.add([res,] tensor1, value, tensor2)
+       -- [res] torch.add([res,] tensor1, value, tensor2)
+   end
 end
 
 function torchtest.csub()
@@ -699,35 +766,130 @@ function torchtest.cinv()
 end
 
 function torchtest.mul()
-   local m1 = torch.randn(10,10)
+   local types = {
+      'torch.ByteTensor',
+      'torch.CharTensor',
+      'torch.ShortTensor',
+      'torch.IntTensor',
+      'torch.FloatTensor',
+      'torch.DoubleTensor',
+      'torch.LongTensor',
+   }
+
+   for k,t in ipairs(types) do
+       local m1 = torch.randn(10,10):type(t)
+       local res1 = m1:clone()
+
+       res1[{ {},3 }]:mul(2)
+
+       local res2 = m1:clone()
+       for i = 1,m1:size(1) do
+           res2[{ i,3 }] = res2[{ i,3 }] * 2
+       end
+
+       local err = (res1-res2):double():abs():max()
+
+       mytester:assertlt(err, precision, 'error in torch.mul - scalar, non contiguous' .. ' ' .. t)
+   end
+end
+
+function torchtest.div()
+    local types = {
+        'torch.ByteTensor',
+        'torch.CharTensor',
+        'torch.ShortTensor',
+        'torch.IntTensor',
+        'torch.FloatTensor',
+        'torch.DoubleTensor',
+        'torch.LongTensor',
+    }
+
+    for k,t in ipairs(types) do
+
+        local m1 = torch.randn(10,10):type(t)
+        local res1 = m1:clone()
+
+        res1[{ {},3 }]:div(2)
+
+        local res2 = m1:clone()
+        for i = 1,m1:size(1) do
+            res2[{ i,3 }] = res2[{ i,3 }] / 2
+        end
+
+        local err = (res1-res2):double():abs():max()
+
+        mytester:assertlt(err, precision, 'error in torch.div - scalar, non contiguous' .. ' ' .. t)
+    end
+end
+
+function torchtest.lshift()
+   local m1 = torch.LongTensor(10,10):random(0,100)
+   local res1 = m1:clone()
+
+   local q = 2
+   local f = math.pow(2, q)
+   res1[{ {},3 }]:lshift(q)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      res2[{ i,3 }] = res2[{ i,3 }] * f
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.lshift - scalar, non contiguous')
+
+   local m1 = torch.LongTensor(10,10):random(0,100)
    local res1 = m1:clone()
 
-   res1[{ {},3 }]:mul(2)
+   local q = 2
+   res1:lshift(q)
 
    local res2 = m1:clone()
    for i = 1,m1:size(1) do
-      res2[{ i,3 }] = res2[{ i,3 }] * 2
+      for j = 1,m1:size(1) do
+         res2[{ i,j }] = res2[{ i,j }] * f
+      end
    end
 
    local err = (res1-res2):abs():max()
 
-   mytester:assertlt(err, precision, 'error in torch.mul - scalar, non contiguous')
+   mytester:assertlt(err, precision, 'error in torch.lshift - scalar, contiguous')
 end
 
-function torchtest.div()
-   local m1 = torch.randn(10,10)
+function torchtest.rshift()
+   local m1 = torch.LongTensor(10,10):random(0,100)
+   local res1 = m1:clone()
+
+   local q = 2
+   local f = math.pow(2, q)
+   res1[{ {},3 }]:rshift(q)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      res2[{ i,3 }] = math.floor(res2[{ i,3 }] / f)
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.rshift - scalar, non contiguous')
+
+   local m1 = torch.LongTensor(10,10):random(0,100)
    local res1 = m1:clone()
 
-   res1[{ {},3 }]:div(2)
+   local q = 2
+   res1:rshift(q)
 
    local res2 = m1:clone()
    for i = 1,m1:size(1) do
-      res2[{ i,3 }] = res2[{ i,3 }] / 2
+      for j = 1,m1:size(1) do
+         res2[{ i,j }] = math.floor(res2[{ i,j }] / f)
+      end
    end
 
    local err = (res1-res2):abs():max()
 
-   mytester:assertlt(err, precision, 'error in torch.div - scalar, non contiguous')
+   mytester:assertlt(err, precision, 'error in torch.rshift - scalar, contiguous')
 end
 
 function torchtest.fmod()
@@ -764,6 +926,86 @@ function torchtest.remainder()
    mytester:assertlt(err, precision, 'error in torch.remainder - scalar, non contiguous')
 end
 
+function torchtest.bitand()
+   local m1 = torch.LongTensor(10,10):random(0,100)
+   local res1 = m1:clone()
+
+   local val = 32 -- This should be a power of 2
+   res1[{ {},3 }]:bitand(val - 1)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      res2[{ i,3 }] = res2[{ i,3 }] % val
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.bitand - scalar, non contiguous')
+
+   local m1 = torch.LongTensor(10,10):random(0,100)
+   local res1 = m1:clone()
+
+   res1:bitand(val - 1)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      for j = 1,m1:size(1) do
+         res2[{ i,j }] = res2[{ i,j }] % val
+      end
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.bitand - scalar, contiguous')
+end
+
+function torchtest.bitor()
+   local m1 = torch.LongTensor(10,10):random(0,10000)
+   local res1 = m1:clone()
+
+   local val = 32 -- This should be a power of 2
+   res1[{ {},3 }]:bitor(val-1)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      res2[{ i,3 }] = math.floor(res2[{ i,3 }] / val) * val + (val - 1)
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.bitor - scalar, non contiguous')
+
+   local m1 = torch.LongTensor(10,10):random(0,10000)
+   local res1 = m1:clone()
+
+   res1:bitor(val - 1)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      for j = 1,m1:size(1) do
+         res2[{ i,j }] = math.floor(res2[{ i,j }] / val) * val + (val - 1)
+      end
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.bitor - scalar, contiguous')
+end
+
+function torchtest.cbitxor()
+   local t1 = torch.LongTensor(10,10):random(0,10000)
+   local t2 = torch.LongTensor(10,10):random(10001,20000)
+
+   -- Perform xor swap and check results
+   local t3 = torch.cbitxor(t1, t2)
+   local r1 = torch.cbitxor(t3, t2)
+   local r2 = torch.cbitxor(t3, t1)
+
+   local err1 = (r1 - t1):abs():max()
+   local err2 = (r2 - t2):abs():max()
+   mytester:assertlt(err1 + err2, precision, 'error in torch.cbitxor contiguous')
+end
+
 function torchtest.mm()
    -- helper function
    local function matrixmultiply(mat1,mat2)
@@ -1016,68 +1258,84 @@ function torchtest.pow() -- [res] torch.pow([res,] x)
    mytester:assertlt(maxerr, precision, 'error in torch.pow - non-contiguous')
 end
 
-function torchtest.cdiv()  -- [res] torch.cdiv([res,] tensor1, tensor2)
-   -- contiguous
-   local m1 = torch.randn(10, 10, 10)
-   local m2 = torch.randn(10, 10 * 10)
-   local sm1 = m1[{4, {}, {}}]
-   local sm2 = m2[{4, {}}]
-   local res1 = torch.cdiv(sm1, sm2)
-   local res2 = res1:clone():zero()
-   for i = 1,sm1:size(1) do
-      for j = 1, sm1:size(2) do
-         local idx1d = (((i-1)*sm1:size(1)))+j
-         res2[i][j] = sm1[i][j] / sm2[idx1d]
-      end
-   end
-   local err = res1:clone():zero()
-   -- find absolute error
-   for i = 1, res1:size(1) do
-      for j = 1, res1:size(2) do
-         err[i][j] = math.abs(res1[i][j] - res2[i][j])
-      end
-   end
-   -- find maximum element of error
-   local maxerr = 0
-   for i = 1, err:size(1) do
-      for j = 1, err:size(2) do
-         if err[i][j] > maxerr then
-            maxerr = err[i][j]
-         end
-      end
-   end
-   mytester:assertlt(maxerr, precision, 'error in torch.cdiv - contiguous')
-
-   -- non-contiguous
-   local m1 = torch.randn(10, 10, 10)
-   local m2 = torch.randn(10 * 10, 10 * 10)
-   local sm1 = m1[{{}, 4, {}}]
-   local sm2 = m2[{{}, 4}]
-   local res1 = torch.cdiv(sm1, sm2)
-   local res2 = res1:clone():zero()
-   for i = 1,sm1:size(1) do
-      for j = 1, sm1:size(2) do
-         local idx1d = (((i-1)*sm1:size(1)))+j
-         res2[i][j] = sm1[i][j] / sm2[idx1d]
-      end
-   end
-   local err = res1:clone():zero()
-   -- find absolute error
-   for i = 1, res1:size(1) do
-      for j = 1, res1:size(2) do
-         err[i][j] = math.abs(res1[i][j] - res2[i][j])
-      end
-   end
-   -- find maximum element of error
-   local maxerr = 0
-   for i = 1, err:size(1) do
-      for j = 1, err:size(2) do
-         if err[i][j] > maxerr then
-            maxerr = err[i][j]
-         end
-      end
+function torchtest.cdiv()
+    local types = {
+        'torch.ByteTensor',
+        'torch.CharTensor',
+        'torch.ShortTensor',
+        'torch.IntTensor',
+        'torch.FloatTensor',
+        'torch.DoubleTensor',
+        'torch.LongTensor',
+    }
+
+    for k,t in ipairs(types) do
+
+        -- [res] torch.cdiv([res,] tensor1, tensor2)
+        -- contiguous
+        local m1 = torch.randn(10, 10, 10):type(t)
+        local m2 = torch.randn(10, 10 * 10):type(t)
+        m2[m2:eq(0)] = 2
+        local sm1 = m1[{4, {}, {}}]
+        local sm2 = m2[{4, {}}]
+        local res1 = torch.cdiv(sm1, sm2)
+        local res2 = res1:clone():zero()
+        for i = 1,sm1:size(1) do
+            for j = 1, sm1:size(2) do
+                local idx1d = (((i-1)*sm1:size(1)))+j
+                res2[i][j] = sm1[i][j] / sm2[idx1d]
+            end
+        end
+        local err = res1:clone():zero()
+        -- find absolute error
+        for i = 1, res1:size(1) do
+            for j = 1, res1:size(2) do
+                err[i][j] = math.abs(res1[i][j] - res2[i][j])
+            end
+        end
+        -- find maximum element of error
+        local maxerr = 0
+        for i = 1, err:size(1) do
+            for j = 1, err:size(2) do
+                if err[i][j] > maxerr then
+                    maxerr = err[i][j]
+                end
+            end
+        end
+        mytester:assertlt(maxerr, precision, 'error in torch.cdiv - contiguous' .. ' ' .. t)
+
+        -- non-contiguous
+        local m1 = torch.randn(10, 10, 10):type(t)
+        local m2 = torch.randn(10 * 10, 10 * 10):type(t)
+        m2[m2:eq(0)] = 2
+        local sm1 = m1[{{}, 4, {}}]
+        local sm2 = m2[{{}, 4}]
+        local res1 = torch.cdiv(sm1, sm2)
+        local res2 = res1:clone():zero()
+        for i = 1,sm1:size(1) do
+            for j = 1, sm1:size(2) do
+                local idx1d = (((i-1)*sm1:size(1)))+j
+                res2[i][j] = sm1[i][j] / sm2[idx1d]
+            end
+        end
+        local err = res1:clone():zero()
+        -- find absolute error
+        for i = 1, res1:size(1) do
+            for j = 1, res1:size(2) do
+                err[i][j] = math.abs(res1[i][j] - res2[i][j])
+            end
+        end
+        -- find maximum element of error
+        local maxerr = 0
+        for i = 1, err:size(1) do
+            for j = 1, err:size(2) do
+                if err[i][j] > maxerr then
+                    maxerr = err[i][j]
+                end
+            end
+        end
+        mytester:assertlt(maxerr, precision, 'error in torch.cdiv - non-contiguous' .. ' ' .. t)
    end
-   mytester:assertlt(maxerr, precision, 'error in torch.cdiv - non-contiguous')
 end
 
 function torchtest.cfmod()
@@ -1208,68 +1466,82 @@ function torchtest.cremainder()
    mytester:assertlt(maxerr, precision, 'error in torch.cremainder - non-contiguous')
 end
 
-function torchtest.cmul()  -- [res] torch.cmul([res,] tensor1, tensor2)
-   -- contiguous
-   local m1 = torch.randn(10, 10, 10)
-   local m2 = torch.randn(10, 10 * 10)
-   local sm1 = m1[{4, {}, {}}]
-   local sm2 = m2[{4, {}}]
-   local res1 = torch.cmul(sm1, sm2)
-   local res2 = res1:clone():zero()
-   for i = 1,sm1:size(1) do
-      for j = 1, sm1:size(2) do
-         local idx1d = (((i-1)*sm1:size(1)))+j
-         res2[i][j] = sm1[i][j] * sm2[idx1d]
-      end
-   end
-   local err = res1:clone():zero()
-   -- find absolute error
-   for i = 1, res1:size(1) do
-      for j = 1, res1:size(2) do
-         err[i][j] = math.abs(res1[i][j] - res2[i][j])
-      end
-   end
-   -- find maximum element of error
-   local maxerr = 0
-   for i = 1, err:size(1) do
-      for j = 1, err:size(2) do
-         if err[i][j] > maxerr then
-            maxerr = err[i][j]
-         end
-      end
-   end
-   mytester:assertlt(maxerr, precision, 'error in torch.cmul - contiguous')
-
-   -- non-contiguous
-   local m1 = torch.randn(10, 10, 10)
-   local m2 = torch.randn(10 * 10, 10 * 10)
-   local sm1 = m1[{{}, 4, {}}]
-   local sm2 = m2[{{}, 4}]
-   local res1 = torch.cmul(sm1, sm2)
-   local res2 = res1:clone():zero()
-   for i = 1,sm1:size(1) do
-      for j = 1, sm1:size(2) do
-         local idx1d = (((i-1)*sm1:size(1)))+j
-         res2[i][j] = sm1[i][j] * sm2[idx1d]
-      end
-   end
-   local err = res1:clone():zero()
-   -- find absolute error
-   for i = 1, res1:size(1) do
-      for j = 1, res1:size(2) do
-         err[i][j] = math.abs(res1[i][j] - res2[i][j])
-      end
-   end
-   -- find maximum element of error
-   local maxerr = 0
-   for i = 1, err:size(1) do
-      for j = 1, err:size(2) do
-         if err[i][j] > maxerr then
-            maxerr = err[i][j]
-         end
-      end
-   end
-   mytester:assertlt(maxerr, precision, 'error in torch.cmul - non-contiguous')
+function torchtest.cmul()
+    local types = {
+        'torch.ByteTensor',
+        'torch.CharTensor',
+        'torch.ShortTensor',
+        'torch.IntTensor',
+        'torch.FloatTensor',
+        'torch.DoubleTensor',
+        'torch.LongTensor',
+    }
+
+    for k,t in ipairs(types) do
+
+        -- [res] torch.cmul([res,] tensor1, tensor2)
+        -- contiguous
+        local m1 = torch.randn(10, 10, 10):type(t)
+        local m2 = torch.randn(10, 10 * 10):type(t)
+        local sm1 = m1[{4, {}, {}}]
+        local sm2 = m2[{4, {}}]
+        local res1 = torch.cmul(sm1, sm2)
+        local res2 = res1:clone():zero()
+        for i = 1,sm1:size(1) do
+            for j = 1, sm1:size(2) do
+                local idx1d = (((i-1)*sm1:size(1)))+j
+                res2[i][j] = sm1[i][j] * sm2[idx1d]
+            end
+        end
+        local err = res1:clone():zero()
+        -- find absolute error
+        for i = 1, res1:size(1) do
+            for j = 1, res1:size(2) do
+                err[i][j] = math.abs(res1[i][j] - res2[i][j])
+            end
+        end
+        -- find maximum element of error
+        local maxerr = 0
+        for i = 1, err:size(1) do
+            for j = 1, err:size(2) do
+                if err[i][j] > maxerr then
+                    maxerr = err[i][j]
+                end
+            end
+        end
+        mytester:assertlt(maxerr, precision, 'error in torch.cmul - contiguous' .. ' ' .. t)
+
+        -- non-contiguous
+        local m1 = torch.randn(10, 10, 10):type(t)
+        local m2 = torch.randn(10 * 10, 10 * 10):type(t)
+        local sm1 = m1[{{}, 4, {}}]
+        local sm2 = m2[{{}, 4}]
+        local res1 = torch.cmul(sm1, sm2)
+        local res2 = res1:clone():zero()
+        for i = 1,sm1:size(1) do
+            for j = 1, sm1:size(2) do
+                local idx1d = (((i-1)*sm1:size(1)))+j
+                res2[i][j] = sm1[i][j] * sm2[idx1d]
+            end
+        end
+        local err = res1:clone():zero()
+        -- find absolute error
+        for i = 1, res1:size(1) do
+            for j = 1, res1:size(2) do
+                err[i][j] = math.abs(res1[i][j] - res2[i][j])
+            end
+        end
+        -- find maximum element of error
+        local maxerr = 0
+        for i = 1, err:size(1) do
+            for j = 1, err:size(2) do
+                if err[i][j] > maxerr then
+                    maxerr = err[i][j]
+                end
+            end
+        end
+        mytester:assertlt(maxerr, precision, 'error in torch.cmul - non-contiguous' .. ' ' .. t)
+    end
 end
 
 function torchtest.cpow()  -- [res] torch.cpow([res,] tensor1, tensor2)
@@ -1342,6 +1614,16 @@ function torchtest.sum()
    local mxx = torch.Tensor()
    torch.sum(mxx,x,2)
    mytester:asserteq(maxdiff(mx,mxx),0,'torch.sum value')
+
+   local y = torch.rand(5, 5, 5)
+   for i=1,3 do
+      local a = y:sum(i)
+      local b = y:narrow(i, 1, 1):clone():zero()
+      for j = 1, 5 do
+         b:add(y:narrow(i, j, 1))
+      end
+      mytester:asserteq(maxdiff(a, b), 0, 'torch.sum value')
+   end
 end
 function torchtest.prod()
    local x = torch.rand(msize,msize)
@@ -1349,6 +1631,16 @@ function torchtest.prod()
    local mxx = torch.Tensor()
    torch.prod(mxx,x,2)
    mytester:asserteq(maxdiff(mx,mxx),0,'torch.prod value')
+
+   local y = torch.rand(5, 5, 5)
+   for i=1,3 do
+      local a = y:prod(i)
+      local b = y:narrow(i, 1, 1):clone():fill(1)
+      for j = 1, 5 do
+         b:cmul(y:narrow(i, j, 1))
+      end
+      mytester:asserteq(maxdiff(a, b), 0, 'torch.prod value')
+   end
 end
 function torchtest.cumsum()
    local x = torch.rand(msize,msize)
@@ -1942,6 +2234,29 @@ function torchtest.catArray()
    local mx = torch.cat({x,y})
    mytester:asserteq(mx:dim(),0,'torch.cat dim')
 end
+function torchtest.catNoDim()
+   local a
+   local b
+   local c
+
+   a = torch.Tensor(msize):uniform()
+   b = torch.Tensor(msize):uniform()
+   c = torch.cat(a, b)
+   mytester:assertTensorEq(c:narrow(1, 1, msize), a, 0, 'torch.cat value')
+   mytester:assertTensorEq(c:narrow(1, msize + 1, msize), b, 0, 'torch.cat value')
+
+   a = torch.Tensor(1, msize):uniform()
+   b = torch.Tensor(1, msize):uniform()
+   c = torch.cat(a, b)
+   mytester:assertTensorEq(c:narrow(2, 1, msize), a, 0, 'torch.cat value')
+   mytester:assertTensorEq(c:narrow(2, msize + 1, msize), b, 0, 'torch.cat value')
+
+   a = torch.Tensor(10, msize):uniform()
+   b = torch.Tensor(10, msize):uniform()
+   c = torch.cat(a, b)
+   mytester:assertTensorEq(c:narrow(2, 1, msize), a, 0, 'torch.cat value')
+   mytester:assertTensorEq(c:narrow(2, msize + 1, msize), b, 0, 'torch.cat value')
+end
 function torchtest.sin_2()
    local x = torch.rand(msize,msize,msize)
    local mx = torch.sin(x)

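For quick reference, a minimal usage sketch of the element-wise shift and bitwise operations exercised by the new tests above (lshift, rshift, bitand, bitor, cbitxor). This sketch is not part of the commit; the tensor values are made up for illustration, and only the operations that appear in the tests are used:

    require 'torch'

    -- In-place shift/bitwise methods on an integer tensor (values are illustrative).
    local t = torch.LongTensor{1, 2, 3, 4}
    t:lshift(2)                    -- each element multiplied by 2^2
    t:rshift(1)                    -- each element floor-divided by 2^1
    t:bitand(7)                    -- keep only the low three bits of each element
    t:bitor(8)                     -- set the bit with value 8 in each element
    local u = torch.LongTensor{5, 6, 7, 8}
    local x = torch.cbitxor(t, u)  -- element-wise xor, returning a new tensor
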
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git


