[lua-torch-torch7] 01/05: New upstream version 0~20170304-g329dff5
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Tue Mar 14 08:37:52 UTC 2017
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.
commit cb2297ab010e6e3e759b6a6d580866d99be74407
Author: Zhou Mo <cdluminate at gmail.com>
Date: Tue Mar 14 08:13:25 2017 +0000
New upstream version 0~20170304-g329dff5
---
TensorMath.lua | 60 +++
doc/maths.md | 213 +++++++++
generic/Tensor.c | 1 -
lib/TH/CMakeLists.txt | 220 +++++----
lib/TH/README.md | 7 +
lib/TH/THGeneral.h.in | 7 -
lib/TH/THStorage.c | 53 +++
lib/TH/THStorage.h | 8 +
lib/TH/THTensor.c | 1 +
lib/TH/THTensor.h | 5 -
lib/TH/THTensorApply.h | 554 ++++++---------------
lib/TH/THVector.c | 8 +
lib/TH/cmake/FindMKL.cmake | 2 +-
lib/TH/cmake/FindSSE.cmake | 16 +-
lib/TH/generic/THTensor.c | 155 +++---
lib/TH/generic/THTensor.h | 8 +-
lib/TH/generic/THTensorConv.c | 10 +-
lib/TH/generic/THTensorCopy.c | 13 +-
lib/TH/generic/THTensorMath.c | 978 +++++++++++++++++++++++++++-----------
lib/TH/generic/THTensorMath.h | 10 +
lib/TH/generic/THVector.h | 11 +-
lib/TH/generic/THVectorDefault.c | 105 ++--
lib/TH/generic/THVectorDispatch.c | 171 +++++--
lib/TH/generic/simd/convolve.c | 6 +-
lib/TH/generic/simd/simd.h | 36 +-
lib/TH/vector/AVX.c | 274 +++++++++++
lib/TH/vector/AVX.h | 23 +
lib/TH/vector/AVX2.c | 47 ++
lib/TH/vector/AVX2.h | 9 +
lib/TH/vector/NEON.c | 81 ++--
lib/TH/vector/SSE.c | 259 ++++++----
test/test.lua | 749 ++++++++++++++++++++---------
32 files changed, 2829 insertions(+), 1271 deletions(-)
diff --git a/TensorMath.lua b/TensorMath.lua
index d816740..53838ae 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -311,6 +311,18 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
{name=Tensor, method={default=1}},
{name=real}})
+ wrap("lshift",
+ cname("lshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("rshift",
+ cname("rshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
wrap("fmod",
cname("fmod"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -323,6 +335,24 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
{name=Tensor, method={default=1}},
{name=real}})
+ wrap("bitand",
+ cname("bitand"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("bitor",
+ cname("bitor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("bitxor",
+ cname("bitxor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
-- mod alias
wrap("mod",
cname("fmod"),
@@ -364,6 +394,18 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
{name=Tensor, method={default=1}},
{name=Tensor}})
+ wrap("clshift",
+ cname("clshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=Tensor}})
+
+ wrap("crshift",
+ cname("crshift"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=Tensor}})
+
wrap("cfmod",
cname("cfmod"),
{{name=Tensor, default=true, returned=true, method={default='nil'}},
@@ -376,6 +418,24 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
{name=Tensor, method={default=1}},
{name=Tensor}})
+ wrap("cbitand",
+ cname("cbitand"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=Tensor}})
+
+ wrap("cbitor",
+ cname("cbitor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=Tensor}})
+
+ wrap("cbitxor",
+ cname("cbitxor"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=Tensor}})
+
-- cmod alias
wrap("cmod",
cname("cfmod"),
diff --git a/doc/maths.md b/doc/maths.md
index b4f1592..eb9f5cf 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -860,6 +860,87 @@ The number of elements must match, but sizes do not matter.
`z:cdiv(x, y)` puts the result in `z`.
+<a name="torch.lshift"></a>
+### [res] torch.lshift([res,] tensor, value) ###
+<a name="torch.lshift"></a>
+
+Left shift all elements in the `Tensor` by the given `value`.
+
+`z = torch.lshift(x, 2)` will return a new `Tensor` with the result of `x << 2`.
+
+`torch.lshift(z, x, 2)` will put the result of `x << 2` in `z`.
+
+`x:lshift(2)` will perform the left shift operation on all elements of `x` by `2` bits.
+
+`z:lshift(x, 2)` puts the result of `x << 2` in `z`.
+
+Note: For float type tensors, `x:lshift(value)` evaluates `x:mul(math.pow(2, value))` internally.
+
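+A short illustrative sketch (output follows from the `x << 2` behaviour documented above):
+
+```lua
+> x = torch.LongTensor{1, 2, 3, 4}
+> x:lshift(2)
+> x
+  4
+  8
+ 12
+ 16
+[torch.LongTensor of size 4]
+```
+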
+<a name="torch.clshift"></a>
+### [res] torch.clshift([res,] tensor1, tensor2) ###
+<a name="torch.clshift"></a>
+
+Performs the left shift operation on each element in `tensor1` by the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(2, 2):fill(1)
+> y = torch.LongTensor(2, 2):range(1, 4)
+> x:clshift(y)
+> x
+ 2 4
+ 8 16
+[torch.LongTensor of size 2x2]
+```
+
+`z = torch.clshift(x, y)` returns a new `Tensor`.
+
+`torch.clshift(z, x, y)` puts the result in `z`.
+
+`y:clshift(x)` left shifts all elements of `y` by the corresponding elements of `x`.
+
+`z:clshift(x, y)` puts the result in `z`.
+
+<a name="torch.rshift"></a>
+### [res] torch.rshift([res,] tensor, value) ###
+<a name="torch.rshift"></a>
+
+Right shift all elements in the `Tensor` by the given `value`.
+
+`z = torch.rshift(x, 2)` will return a new `Tensor` with the result of `x >> 2`.
+
+`torch.rshift(z, x, 2)` will put the result of `x >> 2` in `z`.
+
+`x:rshift(2)` will perform the right shift operation on all elements of `x` by `2` bits.
+
+`z:rshift(x, 2)` puts the result of `x >> 2` in `z`.
+
+Note: For float type tensors, `x:rshift(value)` evaluates `x:div(math.pow(2, value))` internally.
+
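+A short illustrative sketch (output follows from the `x >> 2` behaviour documented above):
+
+```lua
+> x = torch.LongTensor{4, 8, 12, 16}
+> x:rshift(2)
+> x
+ 1
+ 2
+ 3
+ 4
+[torch.LongTensor of size 4]
+```
+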
+<a name="torch.crshift"></a>
+### [res] torch.crshift([res,] tensor1, tensor2) ###
+<a name="torch.crshift"></a>
+
+Performs the right shift operation on each element in `tensor1` by the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(2, 2):fill(32)
+> y = torch.LongTensor(2, 2):range(1, 4)
+> x:crshift(y)
+> x
+ 16 8
+ 4 2
+[torch.LongTensor of size 2x2]
+```
+
+`z = torch.crshift(x, y)` returns a new `Tensor`.
+
+`torch.crshift(z, x, y)` puts the result in `z`.
+
+`y:crshift(x)` right shifts all elements of `y` by the corresponding elements of `x`.
+
+`z:crshift(x, y)` puts the result in `z`.
<a name="torch.addcdiv"></a>
### [res] torch.addcdiv([res,] x [,value], tensor1, tensor2) ###
@@ -1006,6 +1087,138 @@ corresponding elements of `x`.
This function is deprecated and exists only for compatibility with previous versions. Please use `torch.cfmod()` or `torch.cremainder()` instead.
+<a name="torch.bitand"></a>
+### [res] torch.bitand([res,] tensor, value) ###
+<a name="torch.bitand"></a>
+
+Performs the bitwise `and` operation on all elements in the `Tensor` with the given `value`.
+
+`z = torch.bitand(x, value)` will return a new `Tensor` with the result of `x & value`.
+
+`torch.bitand(z, x, value)` will put the result of `x & value` in `z`.
+
+`x:bitand(value)` will perform the bitwise `and` operation on all elements of `x` with `value`.
+
+`z:bitand(x, value)` puts the result of `x & value` in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
+
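+A short illustrative sketch (output follows from `x & value` applied to each element):
+
+```lua
+> x = torch.LongTensor{1, 2, 3, 4}
+> x:bitand(3)
+> x
+ 1
+ 2
+ 3
+ 0
+[torch.LongTensor of size 4]
+```
+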
+<a name="torch.cbitand"></a>
+### [res] torch.cbitand([res,] tensor1, tensor2) ###
+<a name="torch.cbitand"></a>
+
+Performs the bitwise `and` operation on each element in `tensor1` with the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(4):fill(6)
+> y = torch.LongTensor{1, 2, 4, 8}
+> x:cbitand(y)
+> x
+ 0
+ 2
+ 4
+ 0
+[torch.LongTensor of size 4]
+```
+`z = torch.cbitand(x, y)` returns a new `Tensor`.
+
+`torch.cbitand(z, x, y)` puts the result in `z`.
+
+`y:cbitand(x)` performs bitwise `and` of all elements of `y` with the corresponding elements of `x`.
+
+`z:cbitand(x, y)` puts the result in `z`.
+
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
+
+<a name="torch.bitor"></a>
+### [res] torch.bitor([res,] tensor, value) ###
+<a name="torch.bitor"></a>
+
+Performs the bitwise `or` operation on all elements in the `Tensor` with the given `value`.
+
+`z = torch.bitor(x, value)` will return a new `Tensor` with the result of `x | value`.
+
+`torch.bitor(z, x, value)` will put the result of `x | value` in `z`.
+
+`x:bitor(value)` will perform the bitwise `or` operation on all elements of `x` with `value`.
+
+`z:bitor(x, value)` puts the result of `x | value` in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
+
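+A short illustrative sketch (output follows from `x | value` applied to each element):
+
+```lua
+> x = torch.LongTensor{1, 2, 3, 4}
+> x:bitor(4)
+> x
+ 5
+ 6
+ 7
+ 4
+[torch.LongTensor of size 4]
+```
+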
+<a name="torch.cbitor"></a>
+### [res] torch.cbitor([res,] tensor1, tensor2) ###
+<a name="torch.cbitor"></a>
+
+Performs the bitwise `or` operation on each element in `tensor1` with the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(4):fill(3)
+> y = torch.LongTensor{1, 2, 4, 8}
+> x:cbitor(y)
+> x
+ 3
+ 3
+ 7
+ 11
+[torch.LongTensor of size 4]
+```
+`z = torch.cbitor(x, y)` returns a new `Tensor`.
+
+`torch.cbitor(z, x, y)` puts the result in `z`.
+
+`y:cbitor(x)` performs bitwise `or` of all elements of `y` with the corresponding elements of `x`.
+
+`z:cbitor(x, y)` puts the result in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
+
+<a name="torch.bitxor"></a>
+### [res] torch.bitxor([res,] tensor, value) ###
+<a name="torch.bitxor"></a>
+
+Performs the bitwise `xor` operation on all elements in the `Tensor` with the given `value`.
+
+`z = torch.bitxor(x, value)` will return a new `Tensor` with the result of `x ^ value`.
+
+`torch.bitxor(z, x, value)` will put the result of `x ^ value` in `z`.
+
+`x:bitxor(value)` will perform the bitwise `xor` operation on all elements of `x` with `value`.
+
+`z:bitxor(x, value)` puts the result of `x ^ value` in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
+
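+A short illustrative sketch (output follows from `x ^ value` applied to each element):
+
+```lua
+> x = torch.LongTensor{1, 2, 3, 4}
+> x:bitxor(5)
+> x
+ 4
+ 7
+ 6
+ 1
+[torch.LongTensor of size 4]
+```
+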
+<a name="torch.cbitxor"></a>
+### [res] torch.cbitxor([res,] tensor1, tensor2) ###
+<a name="torch.cbitxor"></a>
+
+Performs the bitwise `xor` operation on each element in `tensor1` with the corresponding element in `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.LongTensor(4):fill(15)
+> y = torch.LongTensor{1, 2, 4, 8}
+> x:cbitxor(y)
+> x
+ 14
+ 13
+ 11
+ 7
+[torch.LongTensor of size 4]
+```
+`z = torch.cbitxor(x, y)` returns a new `Tensor`.
+
+`torch.cbitxor(z, x, y)` puts the result in `z`.
+
+`y:cbitxor(x)` performs bitwise `xor` of all elements of `y` with the corresponding elements of `x`.
+
+`z:cbitxor(x, y)` puts the result in `z`.
+
+Note: This function is only supported for [Int|Long|Byte]Tensors
<a name="torch.dot"></a>
### [number] torch.dot(tensor1, tensor2) ###
diff --git a/generic/Tensor.c b/generic/Tensor.c
index c2417fe..aabbbdc 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -1355,7 +1355,6 @@ void torch_Tensor_(init)(lua_State *L)
#ifndef TH_REAL_IS_HALF
THVector_(vectorDispatchInit)();
#endif
-
}
#endif
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 3f66edc..8aeb204 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -20,11 +20,21 @@ IF(NOT TH_INSTALL_BIN_SUBDIR
SET(TH_INSTALL_CMAKE_SUBDIR "share/cmake/TH" CACHE PATH "TH install cmake subdirectory")
ENDIF()
-# flags
+#######################################################################
+##### flags section
+######################################################################
IF(MSVC)
- # respect the standard
- ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+ # MSVC now supports C99 since VS2013/VS2015
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99")
+ELSE(MSVC)
+ # enable gnu99 and not c99 because we use
+ # gnu extensions like posix_memalign
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99")
+ENDIF(MSVC)
+
+IF(MSVC)
+ ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) # respect the standard
ENDIF(MSVC)
IF(UNIX)
@@ -95,70 +105,46 @@ IF(HAVE_GCC_GET_CPUID)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID")
ENDIF(HAVE_GCC_GET_CPUID)
-FIND_PACKAGE(SSE)
+CHECK_C_SOURCE_COMPILES("#include <stdint.h>
+ static inline void cpuid(uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx)
+ {
+ uint32_t a = *eax, b, c = *ecx, d;
+ asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );
+ *eax = a; *ebx = b; *ecx = c; *edx = d;
+ }
+ int main() {
+ uint32_t a,b,c,d;
+ cpuid(&a, &b, &c, &d);
+ return 0;
+ }" NO_GCC_EBX_FPIC_BUG)
+
+IF(NOT NO_GCC_EBX_FPIC_BUG)
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GCC_GET_CPUID")
+ENDIF(NOT NO_GCC_EBX_FPIC_BUG)
+
+
+FIND_PACKAGE(SSE) # checks SSE, AVX and AVX2
IF(C_SSE2_FOUND)
+ MESSAGE(STATUS "SSE2 Found")
SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
ENDIF(C_SSE2_FOUND)
IF(C_SSE3_FOUND)
+ MESSAGE(STATUS "SSE3 Found")
SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} -DUSE_SSE3 ${CMAKE_C_FLAGS}")
ENDIF(C_SSE3_FOUND)
-
-IF(C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
- SET(simd generic/simd/convolve.c)
- IF(MSVC)
- SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve.c PROPERTIES COMPILE_FLAGS "/std:c99")
- ELSE(MSVC)
- SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve.c PROPERTIES COMPILE_FLAGS "-std=c99")
- ENDIF(MSVC)
-ENDIF(C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
-
-IF(C_SSE4_1_FOUND)
- SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${CMAKE_C_FLAGS}")
-ENDIF(C_SSE4_1_FOUND)
-IF(C_SSE4_2_FOUND)
- SET(CMAKE_C_FLAGS "${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}")
-ENDIF(C_SSE4_2_FOUND)
-
-IF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
- IF(MSVC)
- SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast /std:c99")
- ELSE(MSVC)
- SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -std=c99")
- ENDIF(MSVC)
- SET(simd ${simd} generic/simd/convolve5x5_sse.c)
-ENDIF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
-
+# we don't set -mavx and -mavx2 flags globally, but only for specific files
+# however, we want to enable the AVX codepaths, so we still need to
+# add USE_AVX and USE_AVX2 macro defines
IF(C_AVX_FOUND)
+ MESSAGE(STATUS "AVX Found")
SET(CMAKE_C_FLAGS "-DUSE_AVX ${CMAKE_C_FLAGS}")
- IF(MSVC)
- SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast /arch:AVX /std:c99")
- ELSE(MSVC)
- SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -mavx -std=c99")
- ENDIF(MSVC)
- SET(simd ${simd} generic/simd/convolve5x5_avx.c)
ENDIF(C_AVX_FOUND)
+IF(C_AVX2_FOUND)
+ MESSAGE(STATUS "AVX2 Found")
+ SET(CMAKE_C_FLAGS "-DUSE_AVX2 ${CMAKE_C_FLAGS}")
+ENDIF(C_AVX2_FOUND)
-SET(hdr
- THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
- THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
-
-SET(src
- THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
- THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
-
-SET(src ${src} ${hdr} ${simd})
-ADD_LIBRARY(TH SHARED ${src})
-if(BUILD_STATIC)
- ADD_LIBRARY(TH_static STATIC ${src})
-endif()
-
-IF(NOT TH_SO_VERSION)
- SET(TH_SO_VERSION 0)
-ENDIF(NOT TH_SO_VERSION)
-MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}")
-SET_TARGET_PROPERTIES(TH PROPERTIES
- VERSION ${TH_SO_VERSION}
- SOVERSION ${TH_SO_VERSION})
CHECK_C_SOURCE_RUNS("
#include <stdatomic.h>
@@ -202,6 +188,74 @@ int main()
" HAS_GCC_ATOMICS)
ENDIF()
+#######################################################################
+##### sources section
+######################################################################
+
+# IF ANY SIMD FOUND
+IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
+ SET(simd generic/simd/convolve.c)
+ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
+
+# IF SSE4 FOUND
+IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
+ SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}")
+ IF(MSVC)
+ SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast")
+ ELSE(MSVC)
+ SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math")
+ ENDIF(MSVC)
+ SET(simd ${simd} generic/simd/convolve5x5_sse.c)
+ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
+
+# IF AVX FOUND
+IF(C_AVX_FOUND)
+ IF(MSVC)
+ SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}")
+ SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX_FLAGS}")
+ ELSE(MSVC)
+ SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}")
+ SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX_FLAGS}")
+ ENDIF(MSVC)
+ SET(simd ${simd} vector/AVX.c generic/simd/convolve5x5_avx.c)
+ENDIF(C_AVX_FOUND)
+
+IF(C_AVX2_FOUND)
+ IF(MSVC)
+ SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX2_FLAGS}")
+ ELSE(MSVC)
+ SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX2_FLAGS}")
+ ENDIF(MSVC)
+ SET(simd ${simd} vector/AVX2.c)
+ENDIF(C_AVX2_FOUND)
+
+SET(hdr
+ THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
+ THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
+
+SET(src
+ THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
+ THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
+
+SET(src ${src} ${hdr} ${simd})
+
+#######################################################################
+##### build section
+######################################################################
+
+ADD_LIBRARY(TH SHARED ${src})
+if(BUILD_STATIC)
+ ADD_LIBRARY(TH_static STATIC ${src})
+endif()
+
+IF(NOT TH_SO_VERSION)
+ SET(TH_SO_VERSION 0)
+ENDIF(NOT TH_SO_VERSION)
+MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}")
+SET_TARGET_PROPERTIES(TH PROPERTIES
+ VERSION ${TH_SO_VERSION}
+ SOVERSION ${TH_SO_VERSION})
+
IF(HAS_C11_ATOMICS)
ADD_DEFINITIONS(-DUSE_C11_ATOMICS=1)
MESSAGE(STATUS "Atomics: using C11 intrinsics")
@@ -233,10 +287,6 @@ IF(LAPACK_FOUND)
TARGET_LINK_LIBRARIES(TH ${LAPACK_LIBRARIES})
ENDIF(LAPACK_FOUND)
-IF(BLAS_IS_ACCELERATE)
- MESSAGE(STATUS "BLAS FOUND IS ACCELERATE: Fix for sdot")
-ENDIF()
-
IF (UNIX AND NOT APPLE)
INCLUDE(CheckLibraryExists)
# https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830
@@ -253,6 +303,7 @@ IF(UNIX)
IF(HAVE_MMAP)
ADD_DEFINITIONS(-DHAVE_MMAP=1)
ENDIF(HAVE_MMAP)
+ # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html
ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
IF(HAVE_SHM_OPEN)
@@ -268,47 +319,10 @@ IF(UNIX)
ENDIF(HAVE_MALLOC_USABLE_SIZE)
ENDIF(UNIX)
-
-
IF(NOT MSVC)
TARGET_LINK_LIBRARIES(TH m)
ENDIF(NOT MSVC)
-SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-FOREACH(KEYWORD "inline" "__inline__" "__inline")
- IF(NOT DEFINED C_INLINE)
-
- SET(CMAKE_REQUIRED_FLAGS "-Dinline=${KEYWORD} ${CMAKE_C_FLAGS}")
- CHECK_C_SOURCE_RUNS("
- static inline int static_foo()
- {
- return 0;
- }
-
- int main(int argc, char *argv[])
- {
- static_foo();
- return 0;
- }" C_HAS_${KEYWORD})
-
- IF(C_HAS_${KEYWORD})
- SET(C_INLINE TRUE)
-# Right now i put it in THGeneral.h -- debatable
-# ADD_DEFINITIONS("-Dinline=${KEYWORD}")
- SET(TH_INLINE ${KEYWORD})
- MESSAGE(STATUS "C inline is supported (${KEYWORD})")
- ENDIF(C_HAS_${KEYWORD})
- ENDIF(NOT DEFINED C_INLINE)
-ENDFOREACH(KEYWORD)
-SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-
-IF(NOT DEFINED C_INLINE)
- MESSAGE(STATUS "C inline seems not supported")
-# Right now i put it in THGeneral.h -- debatable
-# ADD_DEFINITIONS("-Dinline=")
-SET(TH_INLINE "")
-ENDIF(NOT DEFINED C_INLINE)
-
# Is __thread supported?
IF(NOT MSVC)
CHECK_C_SOURCE_COMPILES("static __thread int x = 1; int main() { return x; }" C_HAS_THREAD)
@@ -324,6 +338,11 @@ ENDIF(NOT C_HAS_THREAD)
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h")
+
+#######################################################################
+##### install section
+######################################################################
+
INSTALL(TARGETS TH
EXPORT TH-exports
RUNTIME DESTINATION "${TH_INSTALL_BIN_SUBDIR}"
@@ -358,6 +377,11 @@ INSTALL(FILES
DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH")
INSTALL(FILES
+ vector/AVX.h
+ vector/AVX2.h
+ DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/vector")
+
+INSTALL(FILES
generic/THBlas.c
generic/THBlas.h
generic/THLapack.c
diff --git a/lib/TH/README.md b/lib/TH/README.md
new file mode 100644
index 0000000..c646ce9
--- /dev/null
+++ b/lib/TH/README.md
@@ -0,0 +1,7 @@
+The following environment variables can be set to disable specific explicit SIMD optimizations.
+
+```
+TH_NO_AVX2=1 # disable AVX2 codepaths
+TH_NO_AVX=1 # disable AVX codepaths
+TH_NO_SSE=1 # disable SSE codepaths
+```
\ No newline at end of file
diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
index bc7e448..de11f1b 100644
--- a/lib/TH/THGeneral.h.in
+++ b/lib/TH/THGeneral.h.in
@@ -13,7 +13,6 @@
#cmakedefine USE_BLAS
#cmakedefine USE_LAPACK
-#cmakedefine BLAS_IS_ACCELERATE
#cmakedefine BLAS_F2C
#ifdef __cplusplus
@@ -32,12 +31,6 @@
# define TH_API TH_EXTERNC
#endif
-#define TH_INLINE @TH_INLINE@
-
-#ifndef __cplusplus
-#define inline @TH_INLINE@
-#endif
-
#ifndef M_PI
# define M_PI 3.14159265358979323846
#endif
diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c
index bb63a43..9c48e77 100644
--- a/lib/TH/THStorage.c
+++ b/lib/TH/THStorage.c
@@ -12,3 +12,56 @@
#include "generic/THStorageCopy.c"
#include "THGenerateHalfType.h"
+
+
+THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) {
+ const int L = TH_DESC_BUFF_LEN;
+ THDescBuff buf;
+ char *str = buf.str;
+ int n = 0;
+ n += snprintf(str, L-n, "[");
+ int i;
+ for(i = 0; i < size->size; i++) {
+ if(n >= L) break;
+ n += snprintf(str+n, L-n, "%ld", size->data[i]);
+ if(i < size->size-1) {
+ n += snprintf(str+n, L-n, " x ");
+ }
+ }
+ if(n < L - 2) {
+ snprintf(str+n, L-n, "]");
+ } else {
+ snprintf(str+L-5, 5, "...]");
+ }
+ return buf;
+}
+
+TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement)
+{
+ ptrdiff_t total_size = (size->size > 0 ? 1 : 0);
+ ptrdiff_t dim_infer = -1;
+ ptrdiff_t i;
+ for (i = 0; i < size->size; i++) {
+ if (size->data[i] == -1) {
+ THArgCheck(dim_infer == -1, 1, "only one dimension can be inferred");
+ dim_infer = i;
+ } else {
+ total_size *= size->data[i];
+ }
+ }
+ if (dim_infer != -1) {
+ THDescBuff buf = THLongStorage_sizeDesc(size);
+ THArgCheck(total_size > 0 && nElement % total_size == 0, 2,
+               "size '%s' is invalid for input with %td elements", buf.str, nElement);
+ } else {
+ THDescBuff buf = THLongStorage_sizeDesc(size);
+ THArgCheck(nElement == total_size, 2,
+               "size '%s' is invalid for input with %td elements", buf.str, nElement);
+ }
+ THLongStorage* copy = THLongStorage_newWithSize(size->size);
+ THLongStorage_copy(copy, size);
+ if (dim_infer != -1) {
+ copy->data[dim_infer] = nElement / total_size;
+ }
+ return copy;
+}
diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h
index 9565e10..df80229 100644
--- a/lib/TH/THStorage.h
+++ b/lib/TH/THStorage.h
@@ -7,6 +7,11 @@
#define THStorage TH_CONCAT_3(TH,Real,Storage)
#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME)
+#define TH_DESC_BUFF_LEN 64
+typedef struct {
+ char str[TH_DESC_BUFF_LEN];
+} THDescBuff;
+
/* fast access methods */
#define TH_STORAGE_GET(storage, idx) ((storage)->data[(idx)])
#define TH_STORAGE_SET(storage, idx, value) ((storage)->data[(idx)] = (value))
@@ -23,4 +28,7 @@
#include "generic/THStorageCopy.h"
#include "THGenerateHalfType.h"
+TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size);
+TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement);
+
#endif
diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c
index 37071df..115e396 100644
--- a/lib/TH/THTensor.c
+++ b/lib/TH/THTensor.c
@@ -1,6 +1,7 @@
#include "THAtomic.h"
#include "THTensor.h"
#include "THVector.h"
+#include "generic/simd/simd.h"
#include "THBlas.h"
#include "THLapack.h"
diff --git a/lib/TH/THTensor.h b/lib/TH/THTensor.h
index a155efd..d2a1c57 100644
--- a/lib/TH/THTensor.h
+++ b/lib/TH/THTensor.h
@@ -7,11 +7,6 @@
#define THTensor TH_CONCAT_3(TH,Real,Tensor)
#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME)
-#define TH_DESC_BUFF_LEN 64
-typedef struct {
- char str[TH_DESC_BUFF_LEN];
-} THDescBuff;
-
/* basics */
#include "generic/THTensor.h"
#include "THGenerateAllTypes.h"
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h
index 4fd69d4..3e6ed6e 100644
--- a/lib/TH/THTensorApply.h
+++ b/lib/TH/THTensorApply.h
@@ -1,353 +1,6 @@
#ifndef TH_TENSOR_APPLY_INC
#define TH_TENSOR_APPLY_INC
-#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
-{ \
- TYPE1 *TENSOR1##_data = NULL; \
- long *TENSOR1##_counter = NULL; \
- long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
- TYPE2 *TENSOR2##_data = NULL; \
- long *TENSOR2##_counter = NULL; \
- long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
- TYPE3 *TENSOR3##_data = NULL; \
- long *TENSOR3##_counter = NULL; \
- long TENSOR3##_stride = 0, TENSOR3##_size = 0, TENSOR3##_dim = 0, TENSOR3##_i, TENSOR3##_n; \
- int TH_TENSOR_APPLY_hasFinished = 0; \
-\
- TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \
- for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
- TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
-\
- TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \
- for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
- TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
-\
- TENSOR3##_n = (TENSOR3->nDimension ? 1 : 0); \
- for(TENSOR3##_i = 0; TENSOR3##_i < TENSOR3->nDimension; TENSOR3##_i++) \
- TENSOR3##_n *= TENSOR3->size[TENSOR3##_i]; \
-\
- if(TENSOR1##_n != TENSOR2##_n || TENSOR1##_n != TENSOR3##_n) /* should we do the check in the function instead? i think so */ \
- THError("inconsistent tensor size"); \
-\
- if(TENSOR1->nDimension == 0) \
- TH_TENSOR_APPLY_hasFinished = 1; \
- else \
- { \
- TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
- for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
- { \
- if(TENSOR1->size[TENSOR1##_dim] != 1) \
- break; \
- } \
- TENSOR1##_stride = (TENSOR1##_dim == -1 ? 0 : TENSOR1->stride[TENSOR1##_dim]); \
- TENSOR1##_size = 1; \
- for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
- { \
- if(TENSOR1->size[TENSOR1##_dim] != 1) \
- { \
- if(TENSOR1->stride[TENSOR1##_dim] == TENSOR1##_size) \
- TENSOR1##_size *= TENSOR1->size[TENSOR1##_dim]; \
- else \
- break; \
- } \
- } \
- TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(TENSOR1##_dim+1)); \
- for(TENSOR1##_i = 0; TENSOR1##_i <= TENSOR1##_dim; TENSOR1##_i++) \
- TENSOR1##_counter[TENSOR1##_i] = 0; \
-\
- TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
- for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
- { \
- if(TENSOR2->size[TENSOR2##_dim] != 1) \
- break; \
- } \
- TENSOR2##_stride = (TENSOR2##_dim == -1 ? 0 : TENSOR2->stride[TENSOR2##_dim]); \
- TENSOR2##_size = 1; \
- for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
- { \
- if(TENSOR2->size[TENSOR2##_dim] != 1) \
- { \
- if(TENSOR2->stride[TENSOR2##_dim] == TENSOR2##_size) \
- TENSOR2##_size *= TENSOR2->size[TENSOR2##_dim]; \
- else \
- break; \
- } \
- } \
- TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(TENSOR2##_dim+1)); \
- for(TENSOR2##_i = 0; TENSOR2##_i <= TENSOR2##_dim; TENSOR2##_i++) \
- TENSOR2##_counter[TENSOR2##_i] = 0; \
-\
- TENSOR3##_data = TENSOR3->storage->data+TENSOR3->storageOffset; \
- for(TENSOR3##_dim = TENSOR3->nDimension-1; TENSOR3##_dim >= 0; TENSOR3##_dim--) \
- { \
- if(TENSOR3->size[TENSOR3##_dim] != 1) \
- break; \
- } \
- TENSOR3##_stride = (TENSOR3##_dim == -1 ? 0 : TENSOR3->stride[TENSOR3##_dim]); \
- TENSOR3##_size = 1; \
- for(TENSOR3##_dim = TENSOR3->nDimension-1; TENSOR3##_dim >= 0; TENSOR3##_dim--) \
- { \
- if(TENSOR3->size[TENSOR3##_dim] != 1) \
- { \
- if(TENSOR3->stride[TENSOR3##_dim] == TENSOR3##_size) \
- TENSOR3##_size *= TENSOR3->size[TENSOR3##_dim]; \
- else \
- break; \
- } \
- } \
- TENSOR3##_counter = (long*)THAlloc(sizeof(long)*(TENSOR3##_dim+1)); \
- for(TENSOR3##_i = 0; TENSOR3##_i <= TENSOR3##_dim; TENSOR3##_i++) \
- TENSOR3##_counter[TENSOR3##_i] = 0; \
- } \
-\
- TENSOR1##_i = 0; \
- TENSOR2##_i = 0; \
- TENSOR3##_i = 0; \
- while(!TH_TENSOR_APPLY_hasFinished) \
- { \
- for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \
- { \
- CODE \
- } \
-\
- if(TENSOR1##_i == TENSOR1##_size) \
- { \
- if(TENSOR1##_dim == -1) \
- break; \
-\
- TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \
- for(TENSOR1##_i = TENSOR1##_dim; TENSOR1##_i >= 0; TENSOR1##_i--) \
- { \
- TENSOR1##_counter[TENSOR1##_i]++; \
- TENSOR1##_data += TENSOR1->stride[TENSOR1##_i]; \
-\
- if(TENSOR1##_counter[TENSOR1##_i] == TENSOR1->size[TENSOR1##_i]) \
- { \
- if(TENSOR1##_i == 0) \
- { \
- TH_TENSOR_APPLY_hasFinished = 1; \
- break; \
- } \
- else \
- { \
- TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1->stride[TENSOR1##_i]; \
- TENSOR1##_counter[TENSOR1##_i] = 0; \
- } \
- } \
- else \
- break; \
- } \
- TENSOR1##_i = 0; \
- } \
-\
- if(TENSOR2##_i == TENSOR2##_size) \
- { \
- if(TENSOR2##_dim == -1) \
- break; \
-\
- TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
- for(TENSOR2##_i = TENSOR2##_dim; TENSOR2##_i >= 0; TENSOR2##_i--) \
- { \
- TENSOR2##_counter[TENSOR2##_i]++; \
- TENSOR2##_data += TENSOR2->stride[TENSOR2##_i]; \
-\
- if(TENSOR2##_counter[TENSOR2##_i] == TENSOR2->size[TENSOR2##_i]) \
- { \
- if(TENSOR2##_i == 0) \
- { \
- TH_TENSOR_APPLY_hasFinished = 1; \
- break; \
- } \
- else \
- { \
- TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2->stride[TENSOR2##_i]; \
- TENSOR2##_counter[TENSOR2##_i] = 0; \
- } \
- } \
- else \
- break; \
- } \
- TENSOR2##_i = 0; \
- } \
-\
- if(TENSOR3##_i == TENSOR3##_size) \
- { \
- if(TENSOR3##_dim == -1) \
- break; \
-\
- TENSOR3##_data -= TENSOR3##_size*TENSOR3##_stride; \
- for(TENSOR3##_i = TENSOR3##_dim; TENSOR3##_i >= 0; TENSOR3##_i--) \
- { \
- TENSOR3##_counter[TENSOR3##_i]++; \
- TENSOR3##_data += TENSOR3->stride[TENSOR3##_i]; \
-\
- if(TENSOR3##_counter[TENSOR3##_i] == TENSOR3->size[TENSOR3##_i]) \
- { \
- if(TENSOR3##_i == 0) \
- { \
- TH_TENSOR_APPLY_hasFinished = 1; \
- break; \
- } \
- else \
- { \
- TENSOR3##_data -= TENSOR3##_counter[TENSOR3##_i]*TENSOR3->stride[TENSOR3##_i]; \
- TENSOR3##_counter[TENSOR3##_i] = 0; \
- } \
- } \
- else \
- break; \
- } \
- TENSOR3##_i = 0; \
- } \
- } \
- THFree(TENSOR1##_counter); \
- THFree(TENSOR2##_counter); \
- THFree(TENSOR3##_counter); \
-}
-
-#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
-{ \
- TYPE1 *TENSOR1##_data = NULL; \
- long *TENSOR1##_counter = NULL; \
- long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
- TYPE2 *TENSOR2##_data = NULL; \
- long *TENSOR2##_counter = NULL; \
- long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
- int TH_TENSOR_APPLY_hasFinished = 0; \
-\
- TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \
- for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
- TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
-\
- TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \
- for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
- TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
-\
- if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
- THError("inconsistent tensor size"); \
-\
- if(TENSOR1->nDimension == 0) \
- TH_TENSOR_APPLY_hasFinished = 1; \
- else \
- { \
- TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
- for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
- { \
- if(TENSOR1->size[TENSOR1##_dim] != 1) \
- break; \
- } \
- TENSOR1##_stride = (TENSOR1##_dim == -1 ? 0 : TENSOR1->stride[TENSOR1##_dim]); \
- TENSOR1##_size = 1; \
- for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
- { \
- if(TENSOR1->size[TENSOR1##_dim] != 1) \
- { \
- if(TENSOR1->stride[TENSOR1##_dim] == TENSOR1##_size) \
- TENSOR1##_size *= TENSOR1->size[TENSOR1##_dim]; \
- else \
- break; \
- } \
- } \
- TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(TENSOR1##_dim+1)); \
- for(TENSOR1##_i = 0; TENSOR1##_i <= TENSOR1##_dim; TENSOR1##_i++) \
- TENSOR1##_counter[TENSOR1##_i] = 0; \
-\
- TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
- for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
- { \
- if(TENSOR2->size[TENSOR2##_dim] != 1) \
- break; \
- } \
- TENSOR2##_stride = (TENSOR2##_dim == -1 ? 0 : TENSOR2->stride[TENSOR2##_dim]); \
- TENSOR2##_size = 1; \
- for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
- { \
- if(TENSOR2->size[TENSOR2##_dim] != 1) \
- { \
- if(TENSOR2->stride[TENSOR2##_dim] == TENSOR2##_size) \
- TENSOR2##_size *= TENSOR2->size[TENSOR2##_dim]; \
- else \
- break; \
- } \
- } \
- TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(TENSOR2##_dim+1)); \
- for(TENSOR2##_i = 0; TENSOR2##_i <= TENSOR2##_dim; TENSOR2##_i++) \
- TENSOR2##_counter[TENSOR2##_i] = 0; \
- } \
-\
- TENSOR1##_i = 0; \
- TENSOR2##_i = 0; \
- while(!TH_TENSOR_APPLY_hasFinished) \
- { \
- for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \
- { \
- CODE \
- } \
-\
- if(TENSOR1##_i == TENSOR1##_size) \
- { \
- if(TENSOR1##_dim == -1) \
- break; \
-\
- TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \
- for(TENSOR1##_i = TENSOR1##_dim; TENSOR1##_i >= 0; TENSOR1##_i--) \
- { \
- TENSOR1##_counter[TENSOR1##_i]++; \
- TENSOR1##_data += TENSOR1->stride[TENSOR1##_i]; \
-\
- if(TENSOR1##_counter[TENSOR1##_i] == TENSOR1->size[TENSOR1##_i]) \
- { \
- if(TENSOR1##_i == 0) \
- { \
- TH_TENSOR_APPLY_hasFinished = 1; \
- break; \
- } \
- else \
- { \
- TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1->stride[TENSOR1##_i]; \
- TENSOR1##_counter[TENSOR1##_i] = 0; \
- } \
- } \
- else \
- break; \
- } \
- TENSOR1##_i = 0; \
- } \
-\
- if(TENSOR2##_i == TENSOR2##_size) \
- { \
- if(TENSOR2##_dim == -1) \
- break; \
-\
- TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
- for(TENSOR2##_i = TENSOR2##_dim; TENSOR2##_i >= 0; TENSOR2##_i--) \
- { \
- TENSOR2##_counter[TENSOR2##_i]++; \
- TENSOR2##_data += TENSOR2->stride[TENSOR2##_i]; \
-\
- if(TENSOR2##_counter[TENSOR2##_i] == TENSOR2->size[TENSOR2##_i]) \
- { \
- if(TENSOR2##_i == 0) \
- { \
- TH_TENSOR_APPLY_hasFinished = 1; \
- break; \
- } \
- else \
- { \
- TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2->stride[TENSOR2##_i]; \
- TENSOR2##_counter[TENSOR2##_i] = 0; \
- } \
- } \
- else \
- break; \
- } \
- TENSOR2##_i = 0; \
- } \
- } \
- THFree(TENSOR1##_counter); \
- THFree(TENSOR2##_counter); \
-}
-
/*
* The basic strategy for apply is as follows:
*
@@ -370,95 +23,198 @@
* Tensor. But because we are guaranteed the subsequent data is contiguous in memory, we
* can simply loop for sizeof(A) iterations and perform the operation, without having to
* follow the order described by the strides of A.
+ *
+ * 3. As an optimization, we merge dimensions of A that are contiguous in memory. For
+ * example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, then the first two
+ * dimensions can be merged for the purposes of APPLY, reducing the number of nested
+ * loops.
*/
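+
+/* Illustrative sketch: an element-wise copy of src into dst using
+ * TH_TENSOR_APPLY2, assuming dst and src are float tensors with the
+ * same number of elements:
+ *
+ *   TH_TENSOR_APPLY2(float, dst, float, src,
+ *                    *dst_data = *src_data;);
+ */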
-#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \
-{ \
+
+#define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \
TYPE *TENSOR##_data = NULL; \
- long *TENSOR##_counter = NULL; \
- long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i; \
- int TH_TENSOR_APPLY_hasFinished = 0; \
+ long *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \
+ long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \
+ int TENSOR##_contiguous = ALLOW_CONTIGUOUS; \
+ TENSOR##_n = (TENSOR->nDimension ? 1 : 0); \
+ for(TENSOR##_i = 0; TENSOR##_i < TENSOR->nDimension; TENSOR##_i++) \
+ TENSOR##_n *= TENSOR->size[TENSOR##_i]; \
\
if(TENSOR->nDimension == 0) \
TH_TENSOR_APPLY_hasFinished = 1; \
else \
{ \
TENSOR##_data = TENSOR->storage->data+TENSOR->storageOffset; \
-\
- /* what is the first stride (ignore first dims=1)? */ \
- /* it will be used for offset updates while looping through the largest contiguous section */ \
- for(TENSOR##_dim = TENSOR->nDimension-1; TENSOR##_dim >= 0; TENSOR##_dim--) \
- { \
- if(TENSOR->size[TENSOR##_dim] != 1) \
- break; \
- } \
- TENSOR##_stride = (TENSOR##_dim == -1 ? 0 : TENSOR->stride[TENSOR##_dim]); \
-\
- /* what is the largest contiguous section? size will store the size of this section */ \
TENSOR##_size = 1; \
- for(TENSOR##_dim = TENSOR->nDimension-1; TENSOR##_dim >= 0; TENSOR##_dim--) \
- { \
- if(TENSOR->size[TENSOR##_dim] != 1) \
- { \
- if(TENSOR->stride[TENSOR##_dim] == TENSOR##_size) \
- TENSOR##_size *= TENSOR->size[TENSOR##_dim]; \
- else \
+ TENSOR##_stride = 1; \
+ for(TENSOR##_i = TENSOR->nDimension-1; TENSOR##_i >= 0; TENSOR##_i--) { \
+ if(TENSOR->size[TENSOR##_i] != 1) { \
+ if(TENSOR->stride[TENSOR##_i] == TENSOR##_size && TENSOR##_i != DIM) \
+ TENSOR##_size *= TENSOR->size[TENSOR##_i]; \
+ else{ \
+ TENSOR##_contiguous = 0; \
break; \
+ } \
} \
} \
-\
- /* allocate an array of k+1 elements, where k is the first index that */ \
- /* break contiguity. Note that if the tensor is contiguous, then k is -1 and */ \
- /* this counter array is empty. */ \
-\
- /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \
- /* storage is given by storage_offset + (i * j), where i is the stride */ \
- /* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \
- TENSOR##_counter = (long*)THAlloc(sizeof(long)*(TENSOR##_dim+1)); \
- for(TENSOR##_i = 0; TENSOR##_i <= TENSOR##_dim; TENSOR##_i++) \
- TENSOR##_counter[TENSOR##_i] = 0; \
+ if (!TENSOR##_contiguous) { \
+ /* Find the dimension of contiguous sections */ \
+ TENSOR##_dim = 1; \
+ for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; TENSOR##_i--) \
+ { \
+ if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM) \
+ TENSOR##_dim++; \
+ } \
+ /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \
+ TENSOR##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR##_dim)); \
+ TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \
+ TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \
+ TH_TENSOR_dim_index = TENSOR##_dim-1; \
+ TENSOR##_dimOffset = &TENSOR##_counter[DIM]; \
+ TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->nDimension-1]; \
+ TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->nDimension-1]; \
+ /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \
+ /* storage is given by storage_offset + (i * j), where i is the stride */ \
+ /* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \
+ for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \
+ TENSOR##_counter[TENSOR##_i] = 0; \
+ } \
+ for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; --TENSOR##_i) { \
+ if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM) { \
+ TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \
+ if (TENSOR##_i < DIM) \
+ TENSOR##_dimOffset--; \
+ } else { \
+ --TH_TENSOR_dim_index; \
+ TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \
+ TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \
+ } \
+ } \
+ /* Size of the inner most section */ \
+ TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \
+ /* Stride of the inner most section */ \
+ TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \
+ } \
} \
-\
- while(!TH_TENSOR_APPLY_hasFinished) \
+ TENSOR##_i = 0;
+
+#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \
+ if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \
{ \
- /* Loop through the contiguous section of the Tensor */ \
- for(TENSOR##_i = 0; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \
- { \
- CODE \
- } \
-\
+ if(TENSOR##_contiguous) \
+ break; \
\
- /* Handle corner case where the entire Tensor was contiguous */ \
- if(TENSOR##_dim == -1) \
+ if(TENSOR##_dim == 1) \
break; \
- \
+\
/* Reset pointer to beginning of loop */ \
- TENSOR##_data -= TENSOR##_i*TENSOR##_stride; \
- for(TENSOR##_i = TENSOR##_dim; TENSOR##_i >= 0; TENSOR##_i--) \
+ TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \
+ for(TENSOR##_i = TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \
{ \
TENSOR##_counter[TENSOR##_i]++; \
-\
/* Jump ahread by the stride of this dimension */ \
- TENSOR##_data += TENSOR->stride[TENSOR##_i]; \
+ TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \
\
- if(TENSOR##_counter[TENSOR##_i] == TENSOR->size[TENSOR##_i]) \
+ if(TENSOR##_counter[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]) \
{ \
if(TENSOR##_i == 0) \
{ \
TH_TENSOR_APPLY_hasFinished = 1; \
break; \
} \
- else \
+ else \
{ \
/* Reset the pointer to the beginning of the chunk defined by this dimension */ \
- TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR->stride[TENSOR##_i]; \
+ TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \
TENSOR##_counter[TENSOR##_i] = 0; \
} \
} \
else \
break; \
} \
+ TENSOR##_i = 0; \
+ } \
+
+#define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \
+{ \
+ int TH_TENSOR_APPLY_hasFinished = 0; \
+ long TH_TENSOR_dim_index = 0; \
+ __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
+ __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
+ __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \
+\
+ if(TENSOR1##_n != TENSOR2##_n || TENSOR1##_n != TENSOR3##_n) /* should we do the check in the function instead? i think so */ \
+ THError("inconsistent tensor size"); \
+\
+ while(!TH_TENSOR_APPLY_hasFinished) \
+ { \
+ /* Loop through the inner most region of the Tensor */ \
+ for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \
+ { \
+ CODE \
+ } \
+ __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
+ __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
+ __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \
+ } \
+ if(TENSOR1##_counter != NULL) \
+ THFree(TENSOR1##_counter); \
+ if(TENSOR2##_counter != NULL) \
+ THFree(TENSOR2##_counter); \
+ if(TENSOR3##_counter != NULL) \
+ THFree(TENSOR3##_counter); \
+}
+
+#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
+ TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE)
+
+#define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \
+{ \
+ int TH_TENSOR_APPLY_hasFinished = 0; \
+ long TH_TENSOR_dim_index = 0; \
+ __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
+ __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
+\
+ if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
+ THError("inconsistent tensor size"); \
+\
+ while(!TH_TENSOR_APPLY_hasFinished) \
+ { \
+ /* Loop through the inner most region of the Tensor */ \
+ for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \
+ { \
+ CODE \
+ } \
+ __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
+ __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
+ } \
+ if(TENSOR1##_counter != NULL) \
+ THFree(TENSOR1##_counter); \
+ if(TENSOR2##_counter != NULL) \
+ THFree(TENSOR2##_counter); \
+}
+
+#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+ TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE)
+
+#define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \
+{ \
+ int TH_TENSOR_APPLY_hasFinished = 0; \
+ long TH_TENSOR_dim_index = 0; \
+ __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \
+\
+ while(!TH_TENSOR_APPLY_hasFinished) \
+ { \
+ /* Loop through the inner most region of the Tensor */ \
+ for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \
+ { \
+ CODE \
+ } \
+ __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \
} \
THFree(TENSOR##_counter); \
}
+#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \
+ TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE)
+
#endif
diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c
index 907adbb..4410578 100644
--- a/lib/TH/THVector.c
+++ b/lib/TH/THVector.c
@@ -15,6 +15,14 @@
#include "vector/SSE.c"
#endif
+#if defined(USE_AVX)
+#include "vector/AVX.h"
+#endif
+
+#if defined(USE_AVX2)
+#include "vector/AVX2.h"
+#endif
+
#include "generic/THVectorDefault.c"
#include "THGenerateAllTypes.h"
diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index e68ae6a..7c9325a 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -50,7 +50,7 @@ ENDIF ("${SIZE_OF_VOIDP}" EQUAL 8)
IF(CMAKE_COMPILER_IS_GNUCC)
SET(mklthreads "mkl_gnu_thread" "mkl_intel_thread")
SET(mklifaces "gf" "intel")
- SET(mklrtls)
+ SET(mklrtls "iomp5")
ELSE(CMAKE_COMPILER_IS_GNUCC)
SET(mklthreads "mkl_intel_thread")
SET(mklifaces "intel")
diff --git a/lib/TH/cmake/FindSSE.cmake b/lib/TH/cmake/FindSSE.cmake
index f6aac07..f84ce89 100644
--- a/lib/TH/cmake/FindSSE.cmake
+++ b/lib/TH/cmake/FindSSE.cmake
@@ -62,12 +62,23 @@ SET(AVX_CODE "
int main()
{
- __m256 a;
+ __m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
+SET(AVX2_CODE "
+ #include <immintrin.h>
+
+ int main()
+ {
+ __m256i a;
+ a = _mm256_abs_epi16(a);
+ return 0;
+ }
+")
+
MACRO(CHECK_SSE lang type flags)
SET(__FLAG_I 1)
SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
@@ -103,9 +114,12 @@ CHECK_SSE(C "SSE3" " ;-msse3;/arch:SSE3")
CHECK_SSE(C "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
CHECK_SSE(C "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX")
+CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2")
CHECK_SSE(CXX "SSE1" " ;-msse;/arch:SSE")
CHECK_SSE(CXX "SSE2" " ;-msse2;/arch:SSE2")
CHECK_SSE(CXX "SSE3" " ;-msse3;/arch:SSE3")
CHECK_SSE(CXX "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
CHECK_SSE(CXX "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
+CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX")
+CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2")
diff --git a/lib/TH/generic/THTensor.c b/lib/TH/generic/THTensor.c
index 13de6d9..7d88bef 100644
--- a/lib/TH/generic/THTensor.c
+++ b/lib/TH/generic/THTensor.c
@@ -67,8 +67,6 @@ void THTensor_(clearFlag)(THTensor *self, const char flag)
/**** creation methods ****/
static void THTensor_(rawInit)(THTensor *self);
-static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
-static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride);
/* Empty init */
@@ -84,12 +82,12 @@ THTensor *THTensor_(newWithTensor)(THTensor *tensor)
{
THTensor *self = THAlloc(sizeof(THTensor));
THTensor_(rawInit)(self);
- THTensor_(rawSet)(self,
- tensor->storage,
- tensor->storageOffset,
- tensor->nDimension,
- tensor->size,
- tensor->stride);
+ THTensor_(setStorageNd)(self,
+ tensor->storage,
+ tensor->storageOffset,
+ tensor->nDimension,
+ tensor->size,
+ tensor->stride);
return self;
}
@@ -104,12 +102,12 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset,
#ifdef DEBUG
THAssert((size ? size->size : (stride ? stride->size : 0)) <= INT_MAX);
#endif
- THTensor_(rawSet)(self,
- storage,
- storageOffset,
- (size ? size->size : (stride ? stride->size : 0)),
- (size ? size->data : NULL),
- (stride ? stride->data : NULL));
+ THTensor_(setStorageNd)(self,
+ storage,
+ storageOffset,
+ (size ? size->size : (stride ? stride->size : 0)),
+ (size ? size->data : NULL),
+ (stride ? stride->data : NULL));
return self;
}
@@ -145,7 +143,7 @@ THTensor *THTensor_(newWithStorage4d)(THStorage *storage, ptrdiff_t storageOffse
THTensor *self = THAlloc(sizeof(THTensor));
THTensor_(rawInit)(self);
- THTensor_(rawSet)(self, storage, storageOffset, 4, size, stride);
+ THTensor_(setStorageNd)(self, storage, storageOffset, 4, size, stride);
return self;
}
@@ -176,7 +174,7 @@ THTensor *THTensor_(newWithSize4d)(long size0, long size1, long size2, long size
THTensor *self = THAlloc(sizeof(THTensor));
THTensor_(rawInit)(self);
- THTensor_(rawResize)(self, 4, size, NULL);
+ THTensor_(resizeNd)(self, 4, size, NULL);
return self;
}
@@ -228,6 +226,17 @@ THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, lon
return self;
}
+THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size)
+{
+ THArgCheck(THTensor_(isContiguous)(tensor), 1, "input is not contiguous");
+ ptrdiff_t numel = THTensor_(nElement)(tensor);
+ THTensor *self = THTensor_(new)();
+ THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel);
+ THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, NULL);
+ THLongStorage_free(inferred_size);
+ return self;
+}
+
/* Resize */
void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *stride)
{
@@ -238,13 +247,13 @@ void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *strid
#ifdef DEBUG
THAssert(size->size <= INT_MAX);
#endif
- THTensor_(rawResize)(self, size->size, size->data, (stride ? stride->data : NULL));
+ THTensor_(resizeNd)(self, size->size, size->data, (stride ? stride->data : NULL));
}
void THTensor_(resizeAs)(THTensor *self, THTensor *src)
{
if(!THTensor_(isSameSizeAs)(self, src))
- THTensor_(rawResize)(self, src->nDimension, src->size, NULL);
+ THTensor_(resizeNd)(self, src->nDimension, src->size, NULL);
}
void THTensor_(resize1d)(THTensor *tensor, long size0)
@@ -266,25 +275,25 @@ void THTensor_(resize4d)(THTensor *self, long size0, long size1, long size2, lon
{
long size[4] = {size0, size1, size2, size3};
- THTensor_(rawResize)(self, 4, size, NULL);
+ THTensor_(resizeNd)(self, 4, size, NULL);
}
void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, long size3, long size4)
{
long size[5] = {size0, size1, size2, size3, size4};
- THTensor_(rawResize)(self, 5, size, NULL);
+ THTensor_(resizeNd)(self, 5, size, NULL);
}
void THTensor_(set)(THTensor *self, THTensor *src)
{
if(self != src)
- THTensor_(rawSet)(self,
- src->storage,
- src->storageOffset,
- src->nDimension,
- src->size,
- src->stride);
+ THTensor_(setStorageNd)(self,
+ src->storage,
+ src->storageOffset,
+ src->nDimension,
+ src->size,
+ src->stride);
}
void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
@@ -295,12 +304,12 @@ void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storag
#ifdef DEBUG
THAssert((size_ ? size_->size : (stride_ ? stride_->size : 0)) <= INT_MAX);
#endif
- THTensor_(rawSet)(self,
- storage_,
- storageOffset_,
- (size_ ? size_->size : (stride_ ? stride_->size : 0)),
- (size_ ? size_->data : NULL),
- (stride_ ? stride_->data : NULL));
+ THTensor_(setStorageNd)(self,
+ storage_,
+ storageOffset_,
+ (size_ ? size_->size : (stride_ ? stride_->size : 0)),
+ (size_ ? size_->data : NULL),
+ (stride_ ? stride_->data : NULL));
}
void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
@@ -346,7 +355,7 @@ void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t stor
long size[4] = {size0_, size1_, size2_, size3_};
long stride[4] = {stride0_, stride1_, stride2_, stride3_};
- THTensor_(rawSet)(self, storage_, storageOffset_, 4, size, stride);
+ THTensor_(setStorageNd)(self, storage_, storageOffset_, 4, size, stride);
}
@@ -401,7 +410,7 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim
THTensor_(set)(self, src);
if(dimension1 == dimension2)
- return;
+ return;
z = self->stride[dimension1];
self->stride[dimension1] = self->stride[dimension2];
@@ -510,6 +519,57 @@ void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension)
}
}
+void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension)
+{
+ int d;
+
+ if(!src)
+ src = self;
+
+ THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 2, "dimension out of range");
+ THArgCheck(src->nDimension > 0, 2, "cannot unsqueeze empty tensor");
+
+ THTensor_(set)(self, src);
+
+ self->size = (long*)THRealloc(self->size, sizeof(long)*(self->nDimension+1));
+ self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1));
+ self->nDimension++;
+ for (d = self->nDimension-1; d > dimension; d--) {
+ self->size[d] = self->size[d-1];
+ self->stride[d] = self->stride[d-1];
+ }
+ if (dimension+1 < self->nDimension) {
+ self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1];
+ } else {
+ self->stride[dimension] = 1;
+ }
+ self->size[dimension] = 1;
+}
+
+int THTensor_(isTransposed)(const THTensor *self)
+{
+ if (THTensor_(isContiguous)(self)) {
+ return 0;
+ }
+ long max_stride = 1;
+ long size_max_stride = 1;
+ long z = 1;
+ int d;
+ for (d = 0; d < self->nDimension; ++d) {
+ if (self->stride[d] == 0 && self->size[d] != 1)
+ return 0;
+ if (self->stride[d] > max_stride) {
+ max_stride = self->stride[d];
+ size_max_stride = self->size[d];
+ }
+ z *= self->size[d];
+ }
+ if (z == max_stride * size_max_stride) {
+ return 1;
+ }
+ return 0;
+}
+
int THTensor_(isContiguous)(const THTensor *self)
{
long z = 1;
@@ -632,7 +692,7 @@ static void THTensor_(rawInit)(THTensor *self)
self->flag = TH_TENSOR_REFCOUNTED;
}
-static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
+void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
{
/* storage */
if(self->storage != storage)
@@ -655,10 +715,10 @@ static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t stor
self->storageOffset = storageOffset;
/* size and stride */
- THTensor_(rawResize)(self, nDimension, size, stride);
+ THTensor_(resizeNd)(self, nDimension, size, stride);
}
-static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride)
+void THTensor_(resizeNd)(THTensor *self, int nDimension, long *size, long *stride)
{
int d;
int nDimension_;
@@ -804,24 +864,9 @@ THDescBuff THTensor_(desc)(const THTensor *tensor) {
}
THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) {
- const int L = TH_DESC_BUFF_LEN;
- THDescBuff buf;
- char *str = buf.str;
- int n = 0;
- n += snprintf(str, L-n, "[");
- int i;
- for(i = 0; i < tensor->nDimension; i++) {
- if(n >= L) break;
- n += snprintf(str+n, L-n, "%ld", tensor->size[i]);
- if(i < tensor->nDimension-1) {
- n += snprintf(str+n, L-n, " x ");
- }
- }
- if(n < L - 2) {
- snprintf(str+n, L-n, "]");
- } else {
- snprintf(str+L-5, 5, "...]");
- }
+ THLongStorage *size = THTensor_(newSizeOf)((THTensor*)tensor);
+ THDescBuff buf = THLongStorage_sizeDesc(size);
+ THLongStorage_free(size);
return buf;
}
diff --git a/lib/TH/generic/THTensor.h b/lib/TH/generic/THTensor.h
index 81e3cb0..2fac0a8 100644
--- a/lib/TH/generic/THTensor.h
+++ b/lib/TH/generic/THTensor.h
@@ -11,7 +11,7 @@ typedef struct THTensor
long *size;
long *stride;
int nDimension;
-
+
THStorage *storage;
ptrdiff_t storageOffset;
int refcount;
@@ -68,9 +68,11 @@ TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sli
TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_);
TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_);
TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_);
-
+TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size);
+
TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride);
TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
+TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, long *size, long *stride);
TH_API void THTensor_(resize1d)(THTensor *tensor, long size0_);
TH_API void THTensor_(resize2d)(THTensor *tensor, long size0_, long size1_);
TH_API void THTensor_(resize3d)(THTensor *tensor, long size0_, long size1_, long size2_);
@@ -79,6 +81,7 @@ TH_API void THTensor_(resize5d)(THTensor *tensor, long size0_, long size1_, long
TH_API void THTensor_(set)(THTensor *self, THTensor *src);
TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, long *size, long *stride);
TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
long size0_, long stride0_);
TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
@@ -101,6 +104,7 @@ TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, lon
TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src);
TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_);
+TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_);
TH_API int THTensor_(isContiguous)(const THTensor *self);
TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src);
diff --git a/lib/TH/generic/THTensorConv.c b/lib/TH/generic/THTensorConv.c
index 1e21991..684ff9d 100644
--- a/lib/TH/generic/THTensorConv.c
+++ b/lib/TH/generic/THTensorConv.c
@@ -44,7 +44,7 @@ void THTensor_(validXCorr2Dptr)(real *r_,
for (ky = 0; ky < kr; ky++) {
real *pis_ = pi_;
for (kx = 0; kx < kc; kx++) {
- THVector_(add)(r_, pis_, alpha*pw_[kx], oc);
+ THVector_(cadd)(r_, r_, pis_, alpha*pw_[kx], oc);
pis_++;
}
pi_ += ic; /* next input line */
@@ -97,7 +97,7 @@ void THTensor_(validConv2Dptr)(real *r_,
for (ky = 0; ky < kr; ky++) {
real *pis_ = pi_;
for (kx = 0; kx < kc; kx++) {
- THVector_(add)(r_, pis_, alpha*pw_[-kx], oc);
+ THVector_(cadd)(r_, r_, pis_, alpha*pw_[-kx], oc);
pis_++;
}
pi_ += ic; /* next input line */
@@ -149,7 +149,7 @@ void THTensor_(fullConv2Dptr)(real *r_,
for (ky = 0; ky < kr; ky++) {
real *pos_ = po_;
for (kx = 0; kx < kc; kx++) {
- THVector_(add)(pos_, t_, alpha*pw_[kx], ic);
+ THVector_(cadd)(pos_, pos_, t_, alpha*pw_[kx], ic);
pos_++;
}
po_ += oc; /* next input line */
@@ -202,7 +202,7 @@ void THTensor_(fullXCorr2Dptr)(real *r_,
for (ky = 0; ky < kr; ky++) {
real *pos_ = po_;
for (kx = 0; kx < kc; kx++) {
- THVector_(add)(pos_, t_, pw_[-kx]*alpha, ic);
+ THVector_(cadd)(pos_, pos_, t_, pw_[-kx]*alpha, ic);
pos_++;
}
po_ += oc; /* next input line */
@@ -255,7 +255,7 @@ void THTensor_(validXCorr2DRevptr)(real *r_,
real z = *k_++ * alpha;
for(ky = 0; ky < or; ky++) {
- THVector_(add)(po_, pi_, z, oc);
+ THVector_(cadd)(po_, po_, pi_, z, oc);
pi_ += ic;
po_ += oc;
}
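
All of the hunks in this file swap the two-operand accumulate THVector_(add)(y, x, c, n), which computed y += c*x, for the three-operand THVector_(cadd)(z, x, y, c, n), which computes z = x + c*y; passing the destination as both z and x preserves the old accumulate-into-output behaviour. A small standalone comparison, with plain loops standing in for the dispatched kernels:

    #include <stdio.h>

    /* Old versus new helper signature, as plain loops: the old add
       accumulated y += c*x in place, the new cadd writes z = x + c*y, so
       passing the destination twice reproduces the accumulation. */
    static void add_old(float *y, const float *x, float c, long n)
    {
      long i;
      for (i = 0; i < n; i++) y[i] += c * x[i];
    }

    static void cadd_new(float *z, const float *x, const float *y, float c, long n)
    {
      long i;
      for (i = 0; i < n; i++) z[i] = x[i] + c * y[i];
    }

    int main(void)
    {
      float a[3] = {1, 2, 3}, b[3] = {1, 2, 3}, k[3] = {10, 20, 30};
      add_old(a, k, 0.5f, 3);        /* a += 0.5*k */
      cadd_new(b, b, k, 0.5f, 3);    /* b = b + 0.5*k, same result */
      printf("%g %g %g   %g %g %g\n", a[0], a[1], a[2], b[0], b[1], b[2]);
      return 0;
    }
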
diff --git a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c
index 3d243e3..e909728 100644
--- a/lib/TH/generic/THTensorCopy.c
+++ b/lib/TH/generic/THTensorCopy.c
@@ -4,7 +4,18 @@
void THTensor_(copy)(THTensor *tensor, THTensor *src)
{
- TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
+ if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) {
+ real *sp = THTensor_(data)(src);
+ real *rp = THTensor_(data)(tensor);
+ ptrdiff_t sz = THTensor_(nElement)(tensor);
+#ifndef TH_REAL_IS_HALF
+ THVector_(copy)(rp, sp, sz);
+#else
+ memcpy(rp, sp, sz * sizeof(real));
+#endif
+ } else {
+ TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
+ }
}
#define IMPLEMENT_THTensor_COPY(TYPENAMESRC, TYPE_SRC) \
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index d1e7420..b95d81f 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -2,18 +2,121 @@
#define TH_GENERIC_FILE "generic/THTensorMath.c"
#else
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
#define TH_OMP_OVERHEAD_THRESHOLD 100000
+#ifdef _OPENMP
+
+#ifndef _WIN32
+#define PRAGMA(P) _Pragma(#P)
+#else
+#define PRAGMA(P) __pragma(P)
+#endif
+
+#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
+{ \
+ ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR); \
+ PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \
+ { \
+ size_t num_threads = omp_get_num_threads(); \
+ size_t tid = omp_get_thread_num(); \
+ ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \
+ ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \
+ TH_TENSOR_offset + TH_TENSOR_size / num_threads; \
+ ptrdiff_t TENSOR##_len = TH_TENSOR_end - TH_TENSOR_offset; \
+ TYPE *TENSOR##_data = THTensor_(data)(TENSOR) + TH_TENSOR_offset; \
+ CODE \
+ } \
+}
+#else
+#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
+{ \
+ TYPE *TENSOR##_data = THTensor_(data)(TENSOR); \
+ ptrdiff_t TENSOR##_len = THTensor_(nElement)(TENSOR); \
+ CODE \
+}
+#endif
+
+#ifdef _OPENMP
+#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+{ \
+ ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
+ PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \
+ { \
+ size_t num_threads = omp_get_num_threads(); \
+ size_t tid = omp_get_thread_num(); \
+ ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \
+ ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \
+ TH_TENSOR_offset + TH_TENSOR_size / num_threads; \
+ ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \
+ TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \
+ TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \
+ CODE \
+ } \
+}
+#else
+#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+{ \
+ TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \
+ TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \
+ ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \
+ CODE \
+}
+#endif
+
+#ifdef _OPENMP
+#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
+{ \
+ ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
+ PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \
+ { \
+ size_t num_threads = omp_get_num_threads(); \
+ size_t tid = omp_get_thread_num(); \
+ ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \
+ ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \
+ TH_TENSOR_offset + TH_TENSOR_size / num_threads; \
+ ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \
+ TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \
+ TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \
+ TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3) + TH_TENSOR_offset; \
+ CODE \
+ } \
+}
+#else
+#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
+{ \
+ TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \
+ TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \
+ TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3); \
+ ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \
+ CODE \
+}
+#endif
+
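
The TH_TENSOR_APPLY*_CONTIG macros above split the flat index range [0, nElement) into one chunk per OpenMP thread, give the last thread the remainder, and hand each thread a data pointer already offset to its chunk. A standalone sketch of that chunking, using ordinary parameters instead of the token-pasted TENSOR##_ names and omitting the overhead-threshold guard; it falls back to a serial loop when OpenMP is unavailable:

    #include <stdio.h>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    /* Chunking sketch matching TH_TENSOR_APPLY_CONTIG above: each thread
       handles size/num_threads elements and the last thread takes the
       remainder. */
    static void fill_parallel(float *data, float value, long size)
    {
    #ifdef _OPENMP
      #pragma omp parallel
      {
        long nt    = omp_get_num_threads();
        long tid   = omp_get_thread_num();
        long begin = tid * (size / nt);
        long end   = (tid == nt - 1) ? size : begin + size / nt;
        long i;
        for (i = begin; i < end; i++)
          data[i] = value;
      }
    #else
      long i;
      for (i = 0; i < size; i++)   /* serial path, like the non-OpenMP macro */
        data[i] = value;
    #endif
    }

    int main(void)
    {
      float buf[10];
      fill_parallel(buf, 3.0f, 10);
      printf("%g %g\n", buf[0], buf[9]);
      return 0;
    }
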
void THTensor_(fill)(THTensor *r_, real value)
{
- TH_TENSOR_APPLY(real, r_,
- THVector_(fill)(r__data, value, r__size); break;);
+ if (THTensor_(isContiguous)(r_) || THTensor_(isTransposed)(r_)) {
+ TH_TENSOR_APPLY_CONTIG(real, r_, THVector_(fill)(r__data, value, r__len););
+ } else {
+ TH_TENSOR_APPLY(real, r_,
+ if (r__stride == 1) {
+ THVector_(fill)(r__data, value, r__size);
+ r__i = r__size;
+ r__data += r__stride * r__size;
+ break;
+ } else {
+ *r__data = value;
+ }
+ );
+ }
}
void THTensor_(zero)(THTensor *r_)
{
- TH_TENSOR_APPLY(real, r_,
- THVector_(fill)(r__data, 0, r__size); break;);
+ THTensor_(fill)(r_, 0);
}
void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value)
@@ -405,9 +508,17 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
#undef th_isnan
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#define th_isnan(val) \
-if (isnan(value)) break;
+(isnan(val))
#else
-#define th_isnan(val)
+#define th_isnan(val) (0)
+#endif
+
+#undef th_isnan_break
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+#define th_isnan_break(val) \
+if (isnan(val)) break;
+#else
+#define th_isnan_break(val)
#endif
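
Splitting the old macro into th_isnan (a plain expression) and th_isnan_break (a statement that breaks out of the scan) supports the NaN convention used by the reductions below: !(value <= theMax) is true both when value is larger and when value is NaN, so a NaN immediately becomes the running extremum and the scan can stop. A minimal floating-point example of that comparison trick:

    #include <stdio.h>
    #include <math.h>

    /* NaN-propagation sketch: !(value <= theMax) is true for larger values
       and for NaN, so a NaN becomes the running maximum and the scan stops
       early, which is what th_isnan_break(value) achieves above. */
    int main(void)
    {
      double data[4] = {1.0, 3.0, NAN, 2.0};
      double theMax = data[0];
      int i;
      for (i = 0; i < 4; i++) {
        double value = data[i];
        if (!(value <= theMax)) {   /* not the same as value > theMax for NaN */
          theMax = value;
          if (isnan(value))         /* what th_isnan_break(value) expands to */
            break;
        }
      }
      printf("max = %f\n", theMax);  /* prints nan */
      return 0;
    }
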
real THTensor_(minall)(THTensor *tensor)
@@ -423,7 +534,7 @@ real THTensor_(minall)(THTensor *tensor)
if(!(value >= theMin))
{
theMin = value;
- th_isnan(value)
+ th_isnan_break(value)
});
return theMin;
}
@@ -441,7 +552,7 @@ real THTensor_(maxall)(THTensor *tensor)
if(!(value <= theMax))
{
theMax = value;
- th_isnan(value)
+ th_isnan_break(value)
});
return theMax;
}
@@ -464,15 +575,9 @@ void THTensor_(add)(THTensor *r_, THTensor *t, real value)
{
THTensor_(resizeAs)(r_, t);
if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
- real *tp = THTensor_(data)(t);
- real *rp = THTensor_(data)(r_);
- ptrdiff_t sz = THTensor_(nElement)(t);
- ptrdiff_t i;
- #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = tp[i] + value;
+ TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(adds)(r__data, t_data, value, r__len););
} else {
- TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
}
}
@@ -485,15 +590,9 @@ void THTensor_(mul)(THTensor *r_, THTensor *t, real value)
{
THTensor_(resizeAs)(r_, t);
if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
- real *tp = THTensor_(data)(t);
- real *rp = THTensor_(data)(r_);
- ptrdiff_t sz = THTensor_(nElement)(t);
- ptrdiff_t i;
- #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = tp[i] * value;
+ TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(muls)(r__data, t_data, value, r__len););
} else {
- TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
}
}
@@ -501,22 +600,87 @@ void THTensor_(div)(THTensor *r_, THTensor *t, real value)
{
THTensor_(resizeAs)(r_, t);
if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(divs)(r__data, t_data, value, r__len););
+ } else {
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;);
+ }
+}
+
+void THTensor_(lshift)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT)
+ return THTensor_(mul)(r_, t, powf(2, value));
+#elif defined(TH_REAL_IS_DOUBLE)
+ return THTensor_(mul)(r_, t, pow(2, value));
+#elif defined(TH_REAL_IS_HALF)
+ return THError("lshift is not supported for torch.HalfTensor");
+#else
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
real *tp = THTensor_(data)(t);
real *rp = THTensor_(data)(r_);
- ptrdiff_t sz = THTensor_(nElement)(t);
- ptrdiff_t i;
- #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = tp[i] / value;
+ long sz = THTensor_(nElement)(t);
+ long i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+ for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_BYTE)
+ rp[i] = ((real) tp[i]) << value;
+#else
+ rp[i] = ((unsigned real) tp[i]) << value;
+#endif
+ }
} else {
- TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;);
+#if defined(TH_REAL_IS_BYTE)
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) << value););
+#else
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((unsigned real) *t_data) << value););
+#endif
+ }
+#endif
+}
+
+void THTensor_(rshift)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT)
+ return THTensor_(div)(r_, t, powf(2, value));
+#elif defined(TH_REAL_IS_DOUBLE)
+ return THTensor_(div)(r_, t, pow(2, value));
+#elif defined(TH_REAL_IS_HALF)
+ return THError("rshift is not supported for torch.HalfTensor");
+#else
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *tp = THTensor_(data)(t);
+ real *rp = THTensor_(data)(r_);
+ long sz = THTensor_(nElement)(t);
+ long i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+ for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_BYTE)
+ rp[i] = ((real) tp[i]) >> value;
+#else
+ rp[i] = ((unsigned real) tp[i]) >> value;
+#endif
+ }
+ } else {
+#if defined(TH_REAL_IS_BYTE)
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) >> value););
+#else
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((unsigned real) *t_data) >> value););
+#endif
}
+#endif
}
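
lshift and rshift pick their behaviour per element type: floating-point tensors are scaled by 2^value, integer tensors get a genuine bit shift (through an unsigned cast in the generated code), and HalfTensor raises an error. A tiny scalar sketch of the two behaviours; the helper names are made up and plain scalars stand in for tensors:

    #include <stdio.h>
    #include <math.h>

    /* Scalar sketch of the per-type lshift/rshift behaviour above. */
    static float lshift_float(float x, int s)
    {
      return x * powf(2.0f, (float)s);      /* float path: scale by 2^s */
    }

    static int lshift_int(int x, int s)
    {
      return (int)((unsigned int)x << s);   /* integer path: real bit shift */
    }

    int main(void)
    {
      printf("float lshift: %g\n", lshift_float(3.0f, 2));       /* 12 */
      printf("int   lshift: %d\n", lshift_int(3, 2));            /* 12 */
      printf("float rshift: %g\n", 12.0f / powf(2.0f, 2.0f));    /* 3  */
      return 0;
    }
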
void THTensor_(fmod)(THTensor *r_, THTensor *t, real value)
{
THTensor_(resizeAs)(r_, t);
if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+
real *tp = THTensor_(data)(t);
real *rp = THTensor_(data)(r_);
ptrdiff_t sz = THTensor_(nElement)(t);
@@ -564,20 +728,89 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
}
}
-void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value)
+void THTensor_(bitand)(THTensor *r_, THTensor *t, real value)
{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return THError("bitand is only supported for integer type tensors");
+#else
THTensor_(resizeAs)(r_, t);
- if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
real *tp = THTensor_(data)(t);
real *rp = THTensor_(data)(r_);
- /* real t_val; */
- ptrdiff_t sz = THTensor_(nElement)(t);
- ptrdiff_t i;
- #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
+ long sz = THTensor_(nElement)(t);
+ long i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+ for (i=0; i<sz; i++) {
+ rp[i] = tp[i] & value;
+ }
+ } else {
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data & value;);
+ }
+#endif
+}
+
+void THTensor_(bitor)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return THError("bitor is only supported for integer type tensors");
+#else
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *tp = THTensor_(data)(t);
+ real *rp = THTensor_(data)(r_);
+ long sz = THTensor_(nElement)(t);
+ long i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+ for (i=0; i<sz; i++) {
+ rp[i] = tp[i] | value;
+ }
+ } else {
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data | value;);
+ }
+#endif
+}
+
+void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return THError("bitxor is only supported for integer type tensors");
+#else
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *tp = THTensor_(data)(t);
+ real *rp = THTensor_(data)(r_);
+ long sz = THTensor_(nElement)(t);
+ long i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
+ for (i=0; i<sz; i++) {
+ rp[i] = tp[i] ^ value;
+ }
+ } else {
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data ^ value;);
+ }
+#endif
+}
+
+void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value)
+{
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *tp = THTensor_(data)(t);
+ real *rp = THTensor_(data)(r_);
+ /* real t_val; */
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+ for (i=0; i<sz; i++)
+ rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
} else {
- TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
}
}
@@ -588,17 +821,10 @@ void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src)
if(r_ == t) {
THBlas_(axpy)(THTensor_(nElement)(t), value, THTensor_(data)(src), 1, THTensor_(data)(r_), 1);
} else {
- real *tp = THTensor_(data)(t);
- real *sp = THTensor_(data)(src);
- real *rp = THTensor_(data)(r_);
- ptrdiff_t sz = THTensor_(nElement)(t);
- ptrdiff_t i;
- #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i< sz; i++)
- rp[i] = tp[i] + value * sp[i];
+ TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cadd)(r__data, t_data, src_data, value, r__len););
}
} else {
- TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;);
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;);
}
}
@@ -611,16 +837,9 @@ void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src)
{
THTensor_(resizeAs)(r_, t);
if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
- real *tp = THTensor_(data)(t);
- real *sp = THTensor_(data)(src);
- real *rp = THTensor_(data)(r_);
- ptrdiff_t sz = THTensor_(nElement)(t);
- ptrdiff_t i;
- #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = tp[i] * sp[i];
+ TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cmul)(r__data, t_data, src_data, r__len););
} else {
- TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;);
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;);
}
}
@@ -628,33 +847,106 @@ void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src)
{
THTensor_(resizeAs)(r_, t);
if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+ real *tp = THTensor_(data)(t);
+ real *sp = THTensor_(data)(src);
+ real *rp = THTensor_(data)(r_);
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+ for (i=0; i<sz; i++)
+ rp[i] = pow(tp[i], sp[i]);
+ } else {
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = pow(*t_data, *src_data););
+ }
+}
+
+void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src)
+{
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+ TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cdiv)(r__data, t_data, src_data, r__len););
+ } else {
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / *src_data;);
+ }
+}
+
+void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src)
+{
+#if defined(TH_REAL_IS_HALF)
+ return THError("clshift is not supported for torch.HalfTensor");
+#endif
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(isContiguous)(src) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
real *tp = THTensor_(data)(t);
real *sp = THTensor_(data)(src);
real *rp = THTensor_(data)(r_);
ptrdiff_t sz = THTensor_(nElement)(t);
ptrdiff_t i;
#pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = pow(tp[i], sp[i]);
+ for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_FLOAT)
+ rp[i] = tp[i] * powf(2, sp[i]);
+#elif defined(TH_REAL_IS_DOUBLE)
+ rp[i] = tp[i] * pow(2, sp[i]);
+#elif defined(TH_REAL_IS_BYTE)
+ rp[i] = ((real) tp[i]) << sp[i];
+#else
+ rp[i] = ((unsigned real) tp[i]) << sp[i];
+#endif
+ }
} else {
- TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = pow(*t_data, *src_data););
+#if defined(TH_REAL_IS_FLOAT)
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * powf(2, *src_data););
+#elif defined(TH_REAL_IS_DOUBLE)
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * pow(2, *src_data););
+#elif defined(TH_REAL_IS_BYTE)
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) << *src_data;);
+#else
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((unsigned real)*t_data) << *src_data;);
+#endif
}
}
-void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src)
+void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src)
{
+#if defined(TH_REAL_IS_HALF)
+ return THError("crshift is not supported for torch.HalfTensor");
+#endif
THTensor_(resizeAs)(r_, t);
- if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(isContiguous)(src) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
real *tp = THTensor_(data)(t);
real *sp = THTensor_(data)(src);
real *rp = THTensor_(data)(r_);
ptrdiff_t sz = THTensor_(nElement)(t);
ptrdiff_t i;
#pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = tp[i] / sp[i];
+ for (i=0; i<sz; i++) {
+#if defined(TH_REAL_IS_FLOAT)
+ rp[i] = tp[i] / powf(2, sp[i]);
+#elif defined(TH_REAL_IS_DOUBLE)
+ rp[i] = tp[i] / pow(2, sp[i]);
+#elif defined(TH_REAL_IS_BYTE)
+ rp[i] = ((real) tp[i]) >> sp[i];
+#else
+ rp[i] = ((unsigned real) tp[i]) >> sp[i];
+#endif
+ }
} else {
- TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / *src_data;);
+#if defined(TH_REAL_IS_FLOAT)
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / powf(2, *src_data););
+#elif defined(TH_REAL_IS_DOUBLE)
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / pow(2, *src_data););
+#elif defined(TH_REAL_IS_BYTE)
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;);
+#else
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((unsigned real)*t_data) >> *src_data;);
+#endif
}
}
@@ -713,19 +1005,94 @@ void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src)
}
}
-void THTensor_(tpow)(THTensor *r_, real value, THTensor *t)
+void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src)
{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return THError("cbitand is only supported for integer type tensors");
+#else
THTensor_(resizeAs)(r_, t);
- if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(isContiguous)(src) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+ real *tp = THTensor_(data)(t);
+ real *sp = THTensor_(data)(src);
+ real *rp = THTensor_(data)(r_);
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+ for (i=0; i<sz; i++) {
+ rp[i] = tp[i] & sp[i];
+ }
+ } else {
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data & *src_data;);
+ }
+#endif
+}
+
+void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return THError("cbitor is only supported for integer type tensors");
+#else
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(isContiguous)(src) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+ real *tp = THTensor_(data)(t);
+ real *sp = THTensor_(data)(src);
+ real *rp = THTensor_(data)(r_);
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+ for (i=0; i<sz; i++) {
+ rp[i] = tp[i] | sp[i];
+ }
+ } else {
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data | *src_data;);
+ }
+#endif
+}
+
+void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src)
+{
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return THError("cbitxor is only supported for integer type tensors");
+#else
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) &&
+ THTensor_(isContiguous)(t) &&
+ THTensor_(isContiguous)(src) &&
+ THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
real *tp = THTensor_(data)(t);
+ real *sp = THTensor_(data)(src);
real *rp = THTensor_(data)(r_);
ptrdiff_t sz = THTensor_(nElement)(t);
ptrdiff_t i;
#pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
- for (i=0; i<sz; i++)
- rp[i] = pow(value, tp[i]);
+ for (i=0; i<sz; i++) {
+ rp[i] = tp[i] ^ sp[i];
+ }
+ } else {
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data ^ *src_data;);
+ }
+#endif
+}
+
+void THTensor_(tpow)(THTensor *r_, real value, THTensor *t)
+{
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *tp = THTensor_(data)(t);
+ real *rp = THTensor_(data)(r_);
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i;
+ #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+ for (i=0; i<sz; i++)
+ rp[i] = pow(value, tp[i]);
} else {
- TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data););
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data););
}
}
@@ -1110,10 +1477,6 @@ ptrdiff_t THTensor_(numel)(THTensor *t)
void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
{
THLongStorage *dim;
- real theMax;
- real value;
- long theIndex;
- long i;
THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
dimension + TH_INDEX_BASE);
@@ -1124,32 +1487,67 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
THLongTensor_resize(indices_, dim, NULL);
THLongStorage_free(dim);
- TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
- theMax = t_data[0];
- theIndex = 0;
+ // two implementations optimized for data locality
+ if (t->stride[dimension] == 1) {
+ real theMax;
+ real value;
+ long theIndex;
+ long i;
+ TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+ theMax = t_data[0];
+ theIndex = 0;
- for(i = 0; i < t_size; i++)
- {
- value = t_data[i*t_stride];
- /* This is not the same as value>theMax in the case of NaNs */
- if(!(value <= theMax))
+ for(i = 0; i < t_size; i++)
{
- theIndex = i;
- theMax = value;
- th_isnan(value)
+ value = t_data[i*t_stride];
+ /* This is not the same as value>theMax in the case of NaNs */
+ if(!(value <= theMax))
+ {
+ theIndex = i;
+ theMax = value;
+ th_isnan_break(value)
+ }
}
- }
- *indices__data = theIndex;
- *values__data = theMax;);
+ *indices__data = theIndex;
+ *values__data = theMax;);
+ } else {
+ if (THTensor_(nDimension)(t) > 1) {
+ THTensor *t0 = THTensor_(newSelect)(t, dimension, 0);
+ THTensor_(copy)(values_, t0);
+ THTensor_(free)(t0);
+ } else {
+ THTensor_(fill)(values_, THTensor_(get1d)(t, 0));
+ }
+ THLongTensor_zero(indices_);
+
+ if(t->size[dimension] == 1) {
+ return;
+ }
+
+ THTensor *tempValues_ = THTensor_(newWithTensor)(values_);
+ // tempValues_.expand_as(t)
+ tempValues_->size[dimension] = t->size[dimension];
+ tempValues_->stride[dimension] = 0;
+
+ THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_);
+ // tempIndices_.expand_as(t)
+ tempIndices_->size[dimension] = t->size[dimension];
+ tempIndices_->stride[dimension] = 0;
+
+ TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension,
+ if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) {
+ *tempValues__data = *t_data;
+ *tempIndices__data = *tempIndices__dimOffset;
+ });
+
+ THTensor_(free)(tempValues_);
+ THLongTensor_free(tempIndices_);
+ }
}
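
The non-unit-stride branch above relies on a stride-0 "expand": the temporary views of values_ and indices_ get the reduced dimension's stride set to 0, so every input element along that dimension maps onto the same output element and a plain elementwise pass performs the reduction. A small 2-D sketch of stride-0 indexing, with bare row-major arrays and the second dimension reduced:

    #include <stdio.h>

    /* Stride-0 "expand" sketch: with stride 0 along the reduced dimension,
       every step along that dimension lands on the same output element, so
       an elementwise walk over the input computes the reduction. */
    int main(void)
    {
      float t[2][3] = {{3, 1, 2}, {5, 9, 4}};
      float values[2] = {t[0][0], t[1][0]};   /* initialised from slice 0 */
      long out_stride[2] = {1, 0};            /* stride 0 on the reduced dim */
      int i, j;

      for (i = 0; i < 2; i++)
        for (j = 0; j < 3; j++) {
          float *out = &values[i * out_stride[0] + j * out_stride[1]];
          if (!(t[i][j] <= *out))             /* NaN-tolerant running max */
            *out = t[i][j];
        }

      printf("row maxima: %g %g\n", values[0], values[1]);   /* 3 9 */
      return 0;
    }
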
void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
{
THLongStorage *dim;
- real theMin;
- real value;
- long theIndex;
- long i;
THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
dimension + TH_INDEX_BASE);
@@ -1160,23 +1558,59 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
THLongTensor_resize(indices_, dim, NULL);
THLongStorage_free(dim);
- TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
- theMin = t_data[0];
- theIndex = 0;
+ // two implementations optimized for data locality
+ if (t->stride[dimension] == 1) {
+ real theMin;
+ real value;
+ long theIndex;
+ long i;
+ TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+ theMin = t_data[0];
+ theIndex = 0;
- for(i = 0; i < t_size; i++)
- {
- value = t_data[i*t_stride];
- /* This is not the same as value<theMin in the case of NaNs */
- if(!(value >= theMin))
+ for(i = 0; i < t_size; i++)
{
- theIndex = i;
- theMin = value;
- th_isnan(value)
+ value = t_data[i*t_stride];
+ /* This is not the same as value<theMin in the case of NaNs */
+ if(!(value >= theMin))
+ {
+ theIndex = i;
+ theMin = value;
+ th_isnan_break(value)
+ }
}
- }
- *indices__data = theIndex;
- *values__data = theMin;);
+ *indices__data = theIndex;
+ *values__data = theMin;);
+ } else {
+ if (THTensor_(nDimension)(t) > 1) {
+ THTensor *t0 = THTensor_(newSelect)(t, dimension, 0);
+ THTensor_(copy)(values_, t0);
+ THTensor_(free)(t0);
+ } else {
+ THTensor_(fill)(values_, THTensor_(get1d)(t, 0));
+ }
+ THLongTensor_zero(indices_);
+
+ if(t->size[dimension] == 1) {
+ return;
+ }
+
+ THTensor *tempValues_ = THTensor_(newWithTensor)(values_);
+ // tempValues_.expand_as(t)
+ tempValues_->size[dimension] = t->size[dimension];
+ tempValues_->stride[dimension] = 0;
+
+ THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_);
+ // tempIndices_.expand_as(t)
+ tempIndices_->size[dimension] = t->size[dimension];
+ tempIndices_->stride[dimension] = 0;
+
+ TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension,
+ if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) {
+ *tempValues__data = *t_data;
+ *tempIndices__data = *tempIndices__dimOffset;
+ });
+ }
}
@@ -1192,12 +1626,24 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension)
THTensor_(resize)(r_, dim, NULL);
THLongStorage_free(dim);
- TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
- accreal sum = 0;
- long i;
- for(i = 0; i < t_size; i++)
- sum += t_data[i*t_stride];
- *r__data = (real)sum;);
+ // two implementations optimized for data locality
+ if (t->stride[dimension] == 1) {
+ TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+ accreal sum = 0;
+ long i;
+ for(i = 0; i < t_size; i++)
+ sum += t_data[i*t_stride];
+ *r__data = (real)sum;);
+ } else {
+ THTensor_(zero)(r_);
+ THTensor *temp_ = THTensor_(newWithTensor)(r_);
+ // r_.expand_as(t)
+ temp_->size[dimension] = t->size[dimension];
+ temp_->stride[dimension] = 0;
+
+ TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;);
+ THTensor_(free)(temp_);
+ }
}
void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
@@ -1212,13 +1658,24 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
THTensor_(resize)(r_, dim, NULL);
THLongStorage_free(dim);
- TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
- accreal prod = 1;
- long i;
- for(i = 0; i < t_size; i++)
- prod *= t_data[i*t_stride];
- *r__data = (real)prod;);
+ // two implementations optimized for data locality
+ if (t->stride[dimension] == 1) {
+ TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+ accreal prod = 1;
+ long i;
+ for(i = 0; i < t_size; i++)
+ prod *= t_data[i*t_stride];
+ *r__data = (real)prod;);
+ } else {
+ THTensor_(fill)(r_, 1);
+ THTensor *temp_ = THTensor_(newWithTensor)(r_);
+ // r_.expand_as(t)
+ temp_->size[dimension] = t->size[dimension];
+ temp_->stride[dimension] = 0;
+ TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;);
+ THTensor_(free)(temp_);
+ }
}
void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension)
@@ -1262,13 +1719,13 @@ void THTensor_(sign)(THTensor *r_, THTensor *t)
#if defined (TH_REAL_IS_BYTE)
TH_TENSOR_APPLY2(real, r_, real, t,
- if (*t_data > 0) *r__data = 1;
- else *r__data = 0;);
+ if (*t_data > 0) *r__data = 1;
+ else *r__data = 0;);
#else
TH_TENSOR_APPLY2(real, r_, real, t,
- if (*t_data > 0) *r__data = 1;
- else if (*t_data < 0) *r__data = -1;
- else *r__data = 0;);
+ if (*t_data > 0) *r__data = 1;
+ else if (*t_data < 0) *r__data = -1;
+ else *r__data = 0;);
#endif
}
@@ -1534,75 +1991,75 @@ static void THTensor_(quicksortascend)(real *arr, long *idx, long elements, long
while(!done) {
/* Use median of three for pivot choice */
- P=(L+R)>>1;
- BOTH_SWAP(P, L+1);
- if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
- if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
- if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
+ P=(L+R)>>1;
+ BOTH_SWAP(P, L+1);
+ if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
+ if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
+ if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
- i = L+1; j = R; piv = ARR(L); pid = IDX(L);
+ i = L+1; j = R; piv = ARR(L); pid = IDX(L);
- do {
- do { i = i+1; } while(ARR(i) < piv);
- do { j = j-1; } while(ARR(j) > piv);
- if (j < i)
- break;
- BOTH_SWAP(i, j);
- } while(1);
- BOTH_SWAP(L, j);
- /* Left subfile is (L, j-1) */
- /* Right subfile is (i, R) */
- sz_left = j-L;
- sz_right = R-i+1;
- if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
- /* both subfiles are small */
- /* if stack empty */
- if (stack == 0) {
- done = 1;
- } else {
- stack--;
- L = beg[stack];
- R = end[stack];
- }
- } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
- /* exactly one of the subfiles is small */
- /* (L,R) = large subfile */
- if (sz_left > sz_right) {
- /* Implicit: L = L; */
- R = j-1;
- } else {
- L = i;
- /* Implicit: R = R; */
- }
+ do {
+ do { i = i+1; } while(ARR(i) < piv);
+ do { j = j-1; } while(ARR(j) > piv);
+ if (j < i)
+ break;
+ BOTH_SWAP(i, j);
+ } while(1);
+ BOTH_SWAP(L, j);
+ /* Left subfile is (L, j-1) */
+ /* Right subfile is (i, R) */
+ sz_left = j-L;
+ sz_right = R-i+1;
+ if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
+ /* both subfiles are small */
+ /* if stack empty */
+ if (stack == 0) {
+ done = 1;
+ } else {
+ stack--;
+ L = beg[stack];
+ R = end[stack];
+ }
+ } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
+ /* exactly one of the subfiles is small */
+ /* (L,R) = large subfile */
+ if (sz_left > sz_right) {
+ /* Implicit: L = L; */
+ R = j-1;
} else {
- /* none of the subfiles is small */
- /* push large subfile */
- /* (L,R) = small subfile */
- if (sz_left > sz_right) {
- beg[stack] = L;
- end[stack] = j-1;
- stack++;
- L = i;
- /* Implicit: R = R */
- } else {
- beg[stack] = i;
- end[stack] = R;
- stack++;
- /* Implicit: L = L; */
- R = j-1;
- }
+ L = i;
+ /* Implicit: R = R; */
}
+ } else {
+ /* none of the subfiles is small */
+ /* push large subfile */
+ /* (L,R) = small subfile */
+ if (sz_left > sz_right) {
+ beg[stack] = L;
+ end[stack] = j-1;
+ stack++;
+ L = i;
+ /* Implicit: R = R */
+ } else {
+ beg[stack] = i;
+ end[stack] = R;
+ stack++;
+ /* Implicit: L = L; */
+ R = j-1;
+ }
+ }
} /* while not done */
/* Now insertion sort on the concatenation of subfiles */
for(i=elements-2; i>=0; i--) {
if (ARR(i) > ARR(i+1)) {
- piv = ARR(i);
+ piv = ARR(i);
pid = IDX(i);
j = i+1;
do {
- ARR(j-1) = ARR(j);
- IDX(j-1) = IDX(j);
- j = j+1;
+ ARR(j-1) = ARR(j);
+ IDX(j-1) = IDX(j);
+ j = j+1;
} while(j < elements && ARR(j) < piv);
ARR(j-1) = piv;
IDX(j-1) = pid;
@@ -1623,75 +2080,75 @@ static void THTensor_(quicksortdescend)(real *arr, long *idx, long elements, lon
while(!done) {
/* Use median of three for pivot choice */
- P=(L+R)>>1;
- BOTH_SWAP(P, L+1);
- if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
- if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
- if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
+ P=(L+R)>>1;
+ BOTH_SWAP(P, L+1);
+ if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
+ if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
+ if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
- i = L+1; j = R; piv = ARR(L); pid = IDX(L);
+ i = L+1; j = R; piv = ARR(L); pid = IDX(L);
- do {
- do { i = i+1; } while(ARR(i) > piv);
- do { j = j-1; } while(ARR(j) < piv);
- if (j < i)
- break;
- BOTH_SWAP(i, j);
- } while(1);
- BOTH_SWAP(L, j);
- /* Left subfile is (L, j-1) */
- /* Right subfile is (i, R) */
- sz_left = j-L;
- sz_right = R-i+1;
- if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
- /* both subfiles are small */
- /* if stack empty */
- if (stack == 0) {
- done = 1;
- } else {
- stack--;
- L = beg[stack];
- R = end[stack];
- }
- } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
- /* exactly one of the subfiles is small */
- /* (L,R) = large subfile */
- if (sz_left > sz_right) {
- /* Implicit: L = L; */
- R = j-1;
- } else {
- L = i;
- /* Implicit: R = R; */
- }
+ do {
+ do { i = i+1; } while(ARR(i) > piv);
+ do { j = j-1; } while(ARR(j) < piv);
+ if (j < i)
+ break;
+ BOTH_SWAP(i, j);
+ } while(1);
+ BOTH_SWAP(L, j);
+ /* Left subfile is (L, j-1) */
+ /* Right subfile is (i, R) */
+ sz_left = j-L;
+ sz_right = R-i+1;
+ if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
+ /* both subfiles are small */
+ /* if stack empty */
+ if (stack == 0) {
+ done = 1;
+ } else {
+ stack--;
+ L = beg[stack];
+ R = end[stack];
+ }
+ } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
+ /* exactly one of the subfiles is small */
+ /* (L,R) = large subfile */
+ if (sz_left > sz_right) {
+ /* Implicit: L = L; */
+ R = j-1;
+ } else {
+ L = i;
+ /* Implicit: R = R; */
+ }
+ } else {
+ /* none of the subfiles is small */
+ /* push large subfile */
+ /* (L,R) = small subfile */
+ if (sz_left > sz_right) {
+ beg[stack] = L;
+ end[stack] = j-1;
+ stack++;
+ L = i;
+ /* Implicit: R = R */
} else {
- /* none of the subfiles is small */
- /* push large subfile */
- /* (L,R) = small subfile */
- if (sz_left > sz_right) {
- beg[stack] = L;
- end[stack] = j-1;
- stack++;
- L = i;
- /* Implicit: R = R */
- } else {
- beg[stack] = i;
- end[stack] = R;
- stack++;
- /* Implicit: L = L; */
- R = j-1;
- }
+ beg[stack] = i;
+ end[stack] = R;
+ stack++;
+ /* Implicit: L = L; */
+ R = j-1;
}
+ }
} /* while not done */
/* Now insertion sort on the concatenation of subfiles */
for(i=elements-2; i>=0; i--) {
if (ARR(i) < ARR(i+1)) {
- piv = ARR(i);
+ piv = ARR(i);
pid = IDX(i);
j = i+1;
do {
- ARR(j-1) = ARR(j);
- IDX(j-1) = IDX(j);
- j = j+1;
+ ARR(j-1) = ARR(j);
+ IDX(j-1) = IDX(j);
+ j = j+1;
} while(j < elements && ARR(j) > piv);
ARR(j-1) = piv;
IDX(j-1) = pid;
@@ -2044,7 +2501,9 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
int maxDim = dimension + 1;
int allEmpty = 1;
int allContiguous = 1;
- int ldimension = dimension;
+
+ // cat_dimension is the actual dimension we cat along
+ int cat_dimension = dimension;
for (i = 0; i < numInputs; i++)
{
@@ -2053,13 +2512,13 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
// When the user input dimension is -1 (i.e. -2 in C)
// Then we pick the maximum last dimension across all tensors.
- if ( dimension == -2 )
+ if ( dimension + TH_INDEX_BASE == -1 )
{
- ldimension = maxDim?(maxDim-1):0;
+ cat_dimension = maxDim?(maxDim-1):0;
}
THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
- THArgCheck(ldimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
+ THArgCheck(cat_dimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
size = THLongStorage_newWithSize(maxDim);
@@ -2067,7 +2526,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
{
// dimSize is either the size of the dim if it exists, either 1 if #dim > 0, otherwise 0
long dimSize = i < inputs[0]->nDimension ? inputs[0]->size[i] : THMin(inputs[0]->nDimension, 1);
- if (i == ldimension)
+ if (i == cat_dimension)
{
for (j = 1; j < numInputs; j++)
{
@@ -2114,7 +2573,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
// First path is for contiguous inputs along dim 1
// Second path for non-contiguous
- if (ldimension == 0 && allContiguous)
+ if (cat_dimension == 0 && allContiguous)
{
real* result_data = result->storage->data + result->storageOffset;
offset = 0;
@@ -2137,9 +2596,9 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
{
if (inputs[j]->nDimension)
{
- long dimSize = ldimension < inputs[j]->nDimension ? inputs[j]->size[ldimension] : 1;
+ long dimSize = cat_dimension < inputs[j]->nDimension ? inputs[j]->size[cat_dimension] : 1;
THTensor *nt = THTensor_(newWithTensor)(result);
- THTensor_(narrow)(nt, NULL, ldimension, offset, dimSize);
+ THTensor_(narrow)(nt, NULL, cat_dimension, offset, dimSize);
THTensor_(copy)(nt, inputs[j]);
THTensor_(free)(nt);
offset += dimSize;
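
The rename to cat_dimension also makes the negative-dimension convention easier to read: TH_INDEX_BASE is 1 in the Lua build, so a user-facing dimension of -1 reaches this function as -2, and the test dimension + TH_INDEX_BASE == -1 selects the last dimension of the widest input. A standalone sketch of that mapping, with a hypothetical helper name:

    #include <stdio.h>

    /* Negative-dimension mapping sketch for catArray above. With
       TH_INDEX_BASE == 1 (Lua build), a user-facing -1 arrives as -2. */
    static int resolve_cat_dim(int dimension, int index_base, int max_dim)
    {
      if (dimension + index_base == -1)
        return max_dim ? max_dim - 1 : 0;   /* last dimension of widest input */
      return dimension;
    }

    int main(void)
    {
      printf("%d\n", resolve_cat_dim(-2, 1, 4));   /* user passed -1 -> dim 3 */
      printf("%d\n", resolve_cat_dim(1, 1, 4));    /* explicit dim stays 1 */
      return 0;
    }
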
@@ -2178,25 +2637,25 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
#define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \
void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value) \
{ \
- THByteTensor_rawResize(r_, t->nDimension, t->size, NULL); \
+ THByteTensor_resizeNd(r_, t->nDimension, t->size, NULL); \
TH_TENSOR_APPLY2(unsigned char, r_, real, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \
{ \
- THTensor_(rawResize)(r_, t->nDimension, t->size, NULL); \
+ THTensor_(resizeNd)(r_, t->nDimension, t->size, NULL); \
TH_TENSOR_APPLY2(real, r_, real, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
{ \
- THByteTensor_rawResize(r_, ta->nDimension, ta->size, NULL); \
+ THByteTensor_resizeNd(r_, ta->nDimension, ta->size, NULL); \
TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
{ \
- THTensor_(rawResize)(r_, ta->nDimension, ta->size, NULL); \
+ THTensor_(resizeNd)(r_, ta->nDimension, ta->size, NULL); \
TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
@@ -2290,22 +2749,11 @@ void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight)
void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension)
{
- THLongStorage *dim;
-
THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d",
dimension + TH_INDEX_BASE);
- dim = THTensor_(newSizeOf)(t);
- THLongStorage_set(dim, dimension, 1);
- THTensor_(resize)(r_, dim, NULL);
- THLongStorage_free(dim);
-
- TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
- accreal sum = 0;
- long i;
- for(i = 0; i < t_size; i++)
- sum += t_data[i*t_stride];
- *r__data = (real)sum/t_size;);
+ THTensor_(sum)(r_, t, dimension);
+ THTensor_(div)(r_, r_, t->size[dimension]);
}
void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
@@ -2491,7 +2939,7 @@ accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value)
{
real sum = 0;
TH_TENSOR_APPLY2(real, tensor, real, src,
- sum += pow(fabs(*tensor_data - *src_data), value);)
+ sum += pow(fabs(*tensor_data - *src_data), value);)
return pow(sum, 1.0/value);
}
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index c656dfd..ff994ed 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -34,17 +34,27 @@ TH_API void THTensor_(add)(THTensor *r_, THTensor *t, real value);
TH_API void THTensor_(sub)(THTensor *self, THTensor *src, real value);
TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, real value);
TH_API void THTensor_(div)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(lshift)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(rshift)(THTensor *r_, THTensor *t, real value);
TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, real value);
TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, real value);
TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value);
+TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value);
TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src);
TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, real value, THTensor *src2);
TH_API void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src);
TH_API void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src);
TH_API void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src);
TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src);
TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src);
TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2);
TH_API void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2);
diff --git a/lib/TH/generic/THVector.h b/lib/TH/generic/THVector.h
index 67fdcfa..7d36854 100644
--- a/lib/TH/generic/THVector.h
+++ b/lib/TH/generic/THVector.h
@@ -3,10 +3,13 @@
#else
TH_API void THVector_(fill)(real *x, const real c, const ptrdiff_t n);
-TH_API void THVector_(add)(real *y, const real *x, const real c, const ptrdiff_t n);
-TH_API void THVector_(diff)(real *z, const real *x, const real *y, const ptrdiff_t n);
-TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n);
-TH_API void THVector_(mul)(real *y, const real *x, const ptrdiff_t n);
+TH_API void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n);
+TH_API void THVector_(adds)(real *y, const real *x, const real c, const ptrdiff_t n);
+TH_API void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n);
+TH_API void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n);
+TH_API void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n);
+TH_API void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n);
+TH_API void THVector_(copy)(real *y, const real *x, const ptrdiff_t n);
/* Initialize the dispatch pointers */
TH_API void THVector_(vectorDispatchInit)(void);
diff --git a/lib/TH/generic/THVectorDefault.c b/lib/TH/generic/THVectorDefault.c
index aabc16c..3388e0d 100644
--- a/lib/TH/generic/THVectorDefault.c
+++ b/lib/TH/generic/THVectorDefault.c
@@ -2,10 +2,25 @@
#define TH_GENERIC_FILE "generic/THVectorDefault.c"
#else
+void THVector_(copy_DEFAULT)(real *x, const real *y, const ptrdiff_t n) {
+ ptrdiff_t i = 0;
+
+ for(; i <n-4; i+=4)
+ {
+ x[i] = y[i];
+ x[i+1] = y[i+1];
+ x[i+2] = y[i+2];
+ x[i+3] = y[i+3];
+ }
+
+ for(; i < n; i++)
+ x[i] = y[i];
+}
+
void THVector_(fill_DEFAULT)(real *x, const real c, const ptrdiff_t n) {
ptrdiff_t i = 0;
- for(; i < n-4; i += 4)
+ for(; i <n-4; i+=4)
{
x[i] = c;
x[i+1] = c;
@@ -17,68 +32,100 @@ void THVector_(fill_DEFAULT)(real *x, const real c, const ptrdiff_t n) {
x[i] = c;
}
-void THVector_(add_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
+void THVector_(cadd_DEFAULT)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n)
+{
+ ptrdiff_t i = 0;
+
+ for(; i<n-4; i+=4)
+ {
+ z[i] = x[i] + c * y[i];
+ z[i+1] = x[i+1] + c * y[i+1];
+ z[i+2] = x[i+2] + c * y[i+2];
+ z[i+3] = x[i+3] + c * y[i+3];
+ }
+
+ for(; i<n; i++)
+ z[i] = x[i] + c * y[i];
+}
+
+void THVector_(adds_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
+{
+ ptrdiff_t i = 0;
+
+ for(; i<n-4; i+=4)
+ {
+ y[i] = x[i] + c;
+ y[i+1] = x[i+1] + c;
+ y[i+2] = x[i+2] + c;
+ y[i+3] = x[i+3] + c;
+ }
+
+ for(; i<n; i++)
+ y[i] = x[i] + c;
+}
+
+void THVector_(cmul_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n)
{
ptrdiff_t i = 0;
- for(;i < n-4; i += 4)
+ for(; i <n-4; i+=4)
{
- y[i] += c * x[i];
- y[i+1] += c * x[i+1];
- y[i+2] += c * x[i+2];
- y[i+3] += c * x[i+3];
+ z[i] = x[i] * y[i];
+ z[i+1] = x[i+1] * y[i+1];
+ z[i+2] = x[i+2] * y[i+2];
+ z[i+3] = x[i+3] * y[i+3];
}
for(; i < n; i++)
- y[i] += c * x[i];
+ z[i] = x[i] * y[i];
}
-void THVector_(diff_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n)
+void THVector_(muls_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
{
ptrdiff_t i = 0;
- for(; i < n-4; i += 4)
+ for(; i <n-4; i+=4)
{
- z[i] = x[i] - y[i];
- z[i+1] = x[i+1] - y[i+1];
- z[i+2] = x[i+2] - y[i+2];
- z[i+3] = x[i+3] - y[i+3];
+ y[i] = x[i] * c;
+ y[i+1] = x[i+1] * c;
+ y[i+2] = x[i+2] * c;
+ y[i+3] = x[i+3] * c;
}
for(; i < n; i++)
- z[i] = x[i] - y[i];
+ y[i] = x[i] * c;
}
-void THVector_(scale_DEFAULT)(real *y, const real c, const ptrdiff_t n)
+void THVector_(cdiv_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n)
{
ptrdiff_t i = 0;
- for(; i < n-4; i +=4)
+ for(; i<n-4; i+=4)
{
- y[i] *= c;
- y[i+1] *= c;
- y[i+2] *= c;
- y[i+3] *= c;
+ z[i] = x[i] / y[i];
+ z[i+1] = x[i+1] / y[i+1];
+ z[i+2] = x[i+2] / y[i+2];
+ z[i+3] = x[i+3] / y[i+3];
}
for(; i < n; i++)
- y[i] *= c;
+ z[i] = x[i] / y[i];
}
-void THVector_(mul_DEFAULT)(real *y, const real *x, const ptrdiff_t n)
+void THVector_(divs_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n)
{
ptrdiff_t i = 0;
- for(; i < n-4; i += 4)
+ for(; i<n-4; i+=4)
{
- y[i] *= x[i];
- y[i+1] *= x[i+1];
- y[i+2] *= x[i+2];
- y[i+3] *= x[i+3];
+ y[i] = x[i] / c;
+ y[i+1] = x[i+1] / c;
+ y[i+2] = x[i+2] / c;
+ y[i+3] = x[i+3] / c;
}
for(; i < n; i++)
- y[i] *= x[i];
+ y[i] = x[i] / c;
}
#endif
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
index 2436a12..5b88852 100644
--- a/lib/TH/generic/THVectorDispatch.c
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -26,6 +26,12 @@ static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(USE_AVX)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(fill_AVX), SIMDExtension_AVX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -38,115 +44,199 @@ void THVector_(fill)(real *x, const real c, const ptrdiff_t n) {
THVector_(fill_DISPATCHPTR)(x, c, n);
}
-static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(add_DEFAULT);
-static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
+static void (*THVector_(cadd_DISPATCHPTR))(real *, const real *, const real *, const real, const ptrdiff_t) = &THVector_(cadd_DEFAULT);
+static FunctionDescription THVector_(cadd_DISPATCHTABLE)[] = {
#if defined(__NEON__)
#if defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(add_NEON), SIMDExtension_NEON),
+ FUNCTION_IMPL(THVector_(cadd_NEON), SIMDExtension_NEON),
#endif
#endif
- #if defined(__PPC64__)
+ #if defined(USE_AVX2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(add_VSX), SIMDExtension_VSX),
+ FUNCTION_IMPL(THVector_(cadd_AVX2), SIMDExtension_AVX2),
+ #endif
+ #endif
+
+ #if defined(USE_AVX)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(cadd_AVX), SIMDExtension_AVX),
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(add_SSE), SIMDExtension_SSE),
+ FUNCTION_IMPL(THVector_(cadd_SSE), SIMDExtension_SSE),
#endif
#endif
- FUNCTION_IMPL(THVector_(add_DEFAULT), SIMDExtension_DEFAULT)
+ FUNCTION_IMPL(THVector_(cadd_DEFAULT), SIMDExtension_DEFAULT)
};
-void THVector_(add)(real *y, const real *x, const real c, const ptrdiff_t n) {
- THVector_(add_DISPATCHPTR)(y, x, c, n);
+void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n) {
+ THVector_(cadd_DISPATCHPTR)(z, x, y, c, n);
}
-
-static void (*THVector_(diff_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(diff_DEFAULT);
-static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = {
+static void (*THVector_(adds_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(adds_DEFAULT);
+static FunctionDescription THVector_(adds_DISPATCHTABLE)[] = {
#if defined(__NEON__)
#if defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(diff_NEON), SIMDExtension_NEON),
+ FUNCTION_IMPL(THVector_(adds_NEON), SIMDExtension_NEON),
#endif
#endif
#if defined(__PPC64__)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(diff_VSX), SIMDExtension_VSX),
+ FUNCTION_IMPL(THVector_(adds_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
+ #if defined(USE_AVX)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(adds_AVX), SIMDExtension_AVX),
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(diff_SSE), SIMDExtension_SSE),
+ FUNCTION_IMPL(THVector_(adds_SSE), SIMDExtension_SSE),
#endif
#endif
- FUNCTION_IMPL(THVector_(diff_DEFAULT), SIMDExtension_DEFAULT)
+ FUNCTION_IMPL(THVector_(adds_DEFAULT), SIMDExtension_DEFAULT)
};
-void THVector_(diff)(real *z, const real *x, const real *y, const ptrdiff_t n) {
- THVector_(diff_DISPATCHPTR)(z, x, y, n);
+// Dispatch stubs that just call the pointers
+TH_API void THVector_(adds)(real *r_, const real *t, const real value, const ptrdiff_t n) {
+ THVector_(adds_DISPATCHPTR)(r_, t, value, n);
}
+static void (*THVector_(cmul_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(cmul_DEFAULT);
+static FunctionDescription THVector_(cmul_DISPATCHTABLE)[] = {
+ #if defined(__NEON__)
+ #if defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(cmul_NEON), SIMDExtension_NEON),
+ #endif
+ #endif
+
+ #if defined(USE_AVX)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(cmul_AVX), SIMDExtension_AVX),
+ #endif
+ #endif
+
+ #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+ || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(cmul_SSE), SIMDExtension_SSE),
+ #endif
+ #endif
+
+ FUNCTION_IMPL(THVector_(cmul_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n) {
+ THVector_(cmul_DISPATCHPTR)(z, x, y, n);
+}
-static void (*THVector_(scale_DISPATCHPTR))(real *, const real, const ptrdiff_t) = &THVector_(scale_DEFAULT);
-static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = {
+static void (*THVector_(muls_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(muls_DEFAULT);
+static FunctionDescription THVector_(muls_DISPATCHTABLE)[] = {
#if defined(__NEON__)
#if defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(scale_NEON), SIMDExtension_NEON),
+ FUNCTION_IMPL(THVector_(muls_NEON), SIMDExtension_NEON),
#endif
#endif
#if defined(__PPC64__)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(scale_VSX), SIMDExtension_VSX),
+ FUNCTION_IMPL(THVector_(muls_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
+ #if defined(USE_AVX)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(muls_AVX), SIMDExtension_AVX),
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(scale_SSE), SIMDExtension_SSE),
+ FUNCTION_IMPL(THVector_(muls_SSE), SIMDExtension_SSE),
#endif
#endif
- FUNCTION_IMPL(THVector_(scale_DEFAULT), SIMDExtension_DEFAULT)
+ FUNCTION_IMPL(THVector_(muls_DEFAULT), SIMDExtension_DEFAULT)
};
-TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n) {
- THVector_(scale_DISPATCHPTR)(y, c, n);
+void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n) {
+ THVector_(muls_DISPATCHPTR)(y, x, c, n);
}
+static void (*THVector_(cdiv_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(cdiv_DEFAULT);
+static FunctionDescription THVector_(cdiv_DISPATCHTABLE)[] = {
+ #if defined(__NEON__)
+ #if defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(cdiv_NEON), SIMDExtension_NEON),
+ #endif
+ #endif
+
+ #if defined(USE_AVX)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(cdiv_AVX), SIMDExtension_AVX),
+ #endif
+ #endif
+
+ #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+ || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(cdiv_SSE), SIMDExtension_SSE),
+ #endif
+ #endif
+
+ FUNCTION_IMPL(THVector_(cdiv_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n) {
+ THVector_(cdiv_DISPATCHPTR)(z, x, y, n);
+}
-static void (*THVector_(mul_DISPATCHPTR))(real *, const real *, const ptrdiff_t) = &THVector_(mul_DEFAULT);
-static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = {
+static void (*THVector_(divs_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(divs_DEFAULT);
+static FunctionDescription THVector_(divs_DISPATCHTABLE)[] = {
#if defined(__NEON__)
#if defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(mul_NEON), SIMDExtension_NEON),
+ FUNCTION_IMPL(THVector_(divs_NEON), SIMDExtension_NEON),
#endif
#endif
- #if defined(__PPC64__)
+ #if defined(USE_AVX)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(mul_VSX), SIMDExtension_VSX),
+ FUNCTION_IMPL(THVector_(divs_AVX), SIMDExtension_AVX),
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
- FUNCTION_IMPL(THVector_(mul_SSE), SIMDExtension_SSE),
+ FUNCTION_IMPL(THVector_(divs_SSE), SIMDExtension_SSE),
+ #endif
+ #endif
+
+ FUNCTION_IMPL(THVector_(divs_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n) {
+ THVector_(divs_DISPATCHPTR)(y, x, c, n);
+}
+
+static void (*THVector_(copy_DISPATCHPTR))(real *, const real *, const ptrdiff_t) = &THVector_(copy_DEFAULT);
+static FunctionDescription THVector_(copy_DISPATCHTABLE)[] = {
+ #if defined(USE_AVX)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(copy_AVX), SIMDExtension_AVX),
#endif
#endif
- FUNCTION_IMPL(THVector_(mul_DEFAULT), SIMDExtension_DEFAULT)
+ FUNCTION_IMPL(THVector_(copy_DEFAULT), SIMDExtension_DEFAULT)
};
-void THVector_(mul)(real *y, const real *x, const ptrdiff_t n) {
- THVector_(mul_DISPATCHPTR);
+void THVector_(copy)(real *y, const real *x, const ptrdiff_t n) {
+ THVector_(copy_DISPATCHPTR)(y, x, n);
}
/* This needs to be called in order to initialize the dispatch pointers at runtime.
@@ -160,10 +250,13 @@ void THVector_(vectorDispatchInit)(void)
{
uint32_t hostSimdExts = detectHostSIMDExtensions();
INIT_DISPATCH_PTR(fill);
- INIT_DISPATCH_PTR(add);
- INIT_DISPATCH_PTR(diff);
- INIT_DISPATCH_PTR(scale);
- INIT_DISPATCH_PTR(mul);
+ INIT_DISPATCH_PTR(cadd);
+ INIT_DISPATCH_PTR(adds);
+ INIT_DISPATCH_PTR(cmul);
+ INIT_DISPATCH_PTR(muls);
+ INIT_DISPATCH_PTR(cdiv);
+ INIT_DISPATCH_PTR(divs);
+ INIT_DISPATCH_PTR(copy);
}
#endif
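The rewritten dispatch file above pairs every vector primitive (cadd, adds, cmul, muls, cdiv, divs, copy) with a per-function dispatch table and a function pointer that vectorDispatchInit() resolves once at startup from the detected host SIMD extensions. A minimal, self-contained sketch of that pattern with simplified names (adds_fn, FnDesc, dispatch_init and my_adds are placeholders, not the TH macros); illustrative only:

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*adds_fn)(float *y, const float *x, float c, ptrdiff_t n);

    static void adds_default(float *y, const float *x, float c, ptrdiff_t n) {
        for (ptrdiff_t i = 0; i < n; i++) y[i] = x[i] + c;
    }

    enum { EXT_DEFAULT = 0, EXT_SSE = 1u << 0, EXT_AVX = 1u << 1 };
    typedef struct { adds_fn fn; uint32_t required_ext; } FnDesc;

    /* One row per candidate implementation, most specific first; the DEFAULT
       row always matches, so the pointer can never stay unresolved. */
    static const FnDesc adds_table[] = {
        /* { adds_avx, EXT_AVX }, { adds_sse, EXT_SSE },  (compiled in when available) */
        { adds_default, EXT_DEFAULT },
    };

    static adds_fn adds_ptr = adds_default;

    /* Called once at startup with the detected host extensions. */
    void dispatch_init(uint32_t host_exts) {
        for (size_t i = 0; i < sizeof adds_table / sizeof adds_table[0]; i++) {
            if (adds_table[i].required_ext == EXT_DEFAULT ||
                (adds_table[i].required_ext & host_exts)) {
                adds_ptr = adds_table[i].fn;
                return;
            }
        }
    }

    /* Public entry point: just forwards through the resolved pointer. */
    void my_adds(float *y, const float *x, float c, ptrdiff_t n) {
        adds_ptr(y, x, c, n);
    }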
diff --git a/lib/TH/generic/simd/convolve.c b/lib/TH/generic/simd/convolve.c
index 842af17..bf07bbe 100644
--- a/lib/TH/generic/simd/convolve.c
+++ b/lib/TH/generic/simd/convolve.c
@@ -1,4 +1,4 @@
-#if defined(USE_AVX)
+#if defined(__AVX__)
#ifdef _MSC_VER
#include <intrin.h>
@@ -113,7 +113,7 @@ void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows,
void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);
void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols) {
-#if defined(USE_AVX)
+#if defined(__AVX__)
int avx = haveCPUFeature(kCPUFeature_AVX);
if (avx)
{
@@ -124,4 +124,4 @@ void convolve_5x5(float* output, float* input, float* kernel, long outRows, long
{
convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols);
}
-}
\ No newline at end of file
+}
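The convolve.c change keys the AVX path off the compiler-defined __AVX__ macro (the AVX kernel was actually compiled) instead of the USE_AVX build option, while keeping the runtime haveCPUFeature(kCPUFeature_AVX) check before calling it. The general shape of that compile-time plus run-time split, sketched with hypothetical helper names (cpu_has_avx and the two kernels are stand-ins, not this file's API):

    #include <stdio.h>

    /* Placeholder kernels and CPU probe; assumptions for illustration only. */
    void convolve_avx_path(void) { puts("AVX path"); }
    void convolve_sse_path(void) { puts("SSE path"); }
    int  cpu_has_avx(void)       { return 0; }  /* real code would query CPUID */

    void convolve(void) {
    #if defined(__AVX__)
        /* The AVX kernel was compiled into this binary; still gate it on the CPU. */
        if (cpu_has_avx()) { convolve_avx_path(); return; }
    #endif
        convolve_sse_path();
    }

    int main(void) { convolve(); return 0; }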
diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h
index aa3b722..19d41b1 100644
--- a/lib/TH/generic/simd/simd.h
+++ b/lib/TH/generic/simd/simd.h
@@ -2,14 +2,15 @@
#define TH_SIMD_INC
#include <stdint.h>
+#include <stdlib.h>
#if defined(_MSC_VER)
#include <intrin.h>
-#elif defined(HAVE_GCC_GET_CPUID)
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
#include <cpuid.h>
#endif
// Can be found on Intel ISA Reference for CPUID
-#define CPUID_AVX2_BIT 0x10 // Bit 5 of EBX for EAX=0x7
+#define CPUID_AVX2_BIT 0x20 // Bit 5 of EBX for EAX=0x7
#define CPUID_AVX_BIT 0x10000000 // Bit 28 of ECX for EAX=0x1
#define CPUID_SSE_BIT 0x2000000 // bit 25 of EDX for EAX=0x1
@@ -99,13 +100,13 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
*ebx = cpuInfo[1];
*ecx = cpuInfo[2];
*edx = cpuInfo[3];
-#elif defined(HAVE_GCC_GET_CPUID)
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
uint32_t level = *eax;
__get_cpuid (level, eax, ebx, ecx, edx);
#else
- uint32_t a = *eax, b, c, d;
+ uint32_t a = *eax, b, c = *ecx, d;
asm volatile ( "cpuid\n\t"
- : "+a"(a), "=b"(b), "=c"(c), "=d"(d) );
+ : "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
*eax = a;
*ebx = b;
*ecx = c;
@@ -117,19 +118,38 @@ static inline uint32_t detectHostSIMDExtensions()
{
uint32_t eax, ebx, ecx, edx;
uint32_t hostSimdExts = 0x0;
+ int TH_NO_AVX = 1, TH_NO_AVX2 = 1, TH_NO_SSE = 1;
+ char *evar;
+
+ evar = getenv("TH_NO_AVX2");
+ if (evar == NULL || strncmp(evar, "1", 2) != 0)
+ TH_NO_AVX2 = 0;
// Check for AVX2. Requires separate CPUID
eax = 0x7;
+ ecx = 0x0;
cpuid(&eax, &ebx, &ecx, &edx);
- if (ebx & CPUID_AVX2_BIT)
+ if ((ebx & CPUID_AVX2_BIT) && TH_NO_AVX2 == 0) {
hostSimdExts |= SIMDExtension_AVX2;
+ }
+ // Detect and enable AVX and SSE
eax = 0x1;
cpuid(&eax, &ebx, &ecx, &edx);
- if (ecx & CPUID_AVX_BIT)
+
+ evar = getenv("TH_NO_AVX");
+ if (evar == NULL || strncmp(evar, "1", 2) != 0)
+ TH_NO_AVX = 0;
+ if (ecx & CPUID_AVX_BIT && TH_NO_AVX == 0) {
hostSimdExts |= SIMDExtension_AVX;
- if (edx & CPUID_SSE_BIT)
+ }
+
+ evar = getenv("TH_NO_SSE");
+ if (evar == NULL || strncmp(evar, "1", 2) != 0)
+ TH_NO_SSE = 0;
+ if (edx & CPUID_SSE_BIT && TH_NO_SSE == 0) {
hostSimdExts |= SIMDExtension_SSE;
+ }
return hostSimdExts;
}
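detectHostSIMDExtensions() now lets users mask individual instruction sets by exporting TH_NO_AVX2, TH_NO_AVX or TH_NO_SSE set to "1"; the corresponding CPUID bit is then ignored. (strncmp is declared in <string.h>, which this hunk does not add, so it presumably arrives through another header.) A standalone sketch of the same gating over a simplified feature bitmask; the names here are illustrative, not the TH ones:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    enum { EXT_SSE = 1u << 0, EXT_AVX = 1u << 1, EXT_AVX2 = 1u << 2 };

    /* Nonzero when the named environment variable is set to "1". */
    static int env_disabled(const char *name) {
        const char *v = getenv(name);
        return v != NULL && strncmp(v, "1", 2) == 0;
    }

    /* Start from what CPUID reported, then mask out whatever the user disabled. */
    static uint32_t detect_exts(uint32_t cpu_reported) {
        uint32_t exts = cpu_reported;
        if (env_disabled("TH_NO_AVX2")) exts &= ~(uint32_t)EXT_AVX2;
        if (env_disabled("TH_NO_AVX"))  exts &= ~(uint32_t)EXT_AVX;
        if (env_disabled("TH_NO_SSE"))  exts &= ~(uint32_t)EXT_SSE;
        return exts;
    }

    int main(void) {
        printf("enabled extensions: 0x%x\n",
               (unsigned)detect_exts(EXT_SSE | EXT_AVX | EXT_AVX2));
        return 0;
    }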
diff --git a/lib/TH/vector/AVX.c b/lib/TH/vector/AVX.c
new file mode 100644
index 0000000..b7d5dd1
--- /dev/null
+++ b/lib/TH/vector/AVX.c
@@ -0,0 +1,274 @@
+#if defined(__AVX__)
+#ifndef _MSC_VER
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+
+#include "AVX.h"
+
+void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) {
+ ptrdiff_t i;
+ ptrdiff_t off;
+ for (i=0; i<=((n)-8); i+=8) {
+ _mm256_storeu_pd(y+i, _mm256_loadu_pd(x+i));
+ _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(x+i+4));
+ }
+ off = (n) - ((n)%8);
+ for (i=0; i<((n)%8); i++) {
+ y[off+i] = x[off+i];
+ }
+}
+
+void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ ptrdiff_t off;
+ __m256d YMM0 = _mm256_set_pd(c, c, c, c);
+ for (i=0; i<=((n)-16); i+=16) {
+ _mm256_storeu_pd((x)+i , YMM0);
+ _mm256_storeu_pd((x)+i+4, YMM0);
+ _mm256_storeu_pd((x)+i+8, YMM0);
+ _mm256_storeu_pd((x)+i+12, YMM0);
+ }
+ off = (n) - ((n)%16);
+ for (i=0; i<((n)%16); i++) {
+ x[off+i] = c;
+ }
+}
+
+void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256d YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-8); i+=8) {
+ YMM0 = _mm256_loadu_pd(x+i);
+ YMM1 = _mm256_loadu_pd(x+i+4);
+ YMM2 = _mm256_loadu_pd(y+i);
+ YMM3 = _mm256_loadu_pd(y+i+4);
+ YMM2 = _mm256_div_pd(YMM0, YMM2);
+ YMM3 = _mm256_div_pd(YMM1, YMM3);
+ _mm256_storeu_pd(z+i, YMM2);
+ _mm256_storeu_pd(z+i+4, YMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] / y[i];
+ }
+}
+
+void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+ __m256d YMM0, YMM1;
+ for (i=0; i<=((n)-8); i+=8) {
+ YMM0 = _mm256_loadu_pd(x+i);
+ YMM1 = _mm256_loadu_pd(x+i+4);
+ YMM0 = _mm256_div_pd(YMM0, YMM15);
+ YMM1 = _mm256_div_pd(YMM1, YMM15);
+ _mm256_storeu_pd(y+i, YMM0);
+ _mm256_storeu_pd(y+i+4, YMM1);
+ }
+ for (; i<(n); i++) {
+ y[i] = x[i] / c;
+ }
+}
+
+void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256d YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-8); i+=8) {
+ YMM0 = _mm256_loadu_pd(x+i);
+ YMM1 = _mm256_loadu_pd(x+i+4);
+ YMM2 = _mm256_loadu_pd(y+i);
+ YMM3 = _mm256_loadu_pd(y+i+4);
+ YMM2 = _mm256_mul_pd(YMM0, YMM2);
+ YMM3 = _mm256_mul_pd(YMM1, YMM3);
+ _mm256_storeu_pd(z+i, YMM2);
+ _mm256_storeu_pd(z+i+4, YMM3);
+ }
+ for (; i<n; i++) {
+ z[i] = x[i] * y[i];
+ }
+}
+
+void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+ __m256d YMM0, YMM1;
+ for (i=0; i<=((n)-8); i+=8) {
+ YMM0 = _mm256_loadu_pd(x+i);
+ YMM1 = _mm256_loadu_pd(x+i+4);
+ YMM0 = _mm256_mul_pd(YMM0, YMM15);
+ YMM1 = _mm256_mul_pd(YMM1, YMM15);
+ _mm256_storeu_pd(y+i, YMM0);
+ _mm256_storeu_pd(y+i+4, YMM1);
+ }
+ for (; i<n; i++) {
+ y[i] = x[i] * c;
+ }
+}
+
+void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+ __m256d YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-4); i+=4) {
+ YMM0 = _mm256_loadu_pd(y+i);
+ YMM1 = _mm256_loadu_pd(x+i);
+ YMM2 = _mm256_mul_pd(YMM0, YMM15);
+ YMM3 = _mm256_add_pd(YMM1, YMM2);
+ _mm256_storeu_pd(z+i, YMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] + y[i] * c;
+ }
+}
+
+void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+ __m256d YMM0, YMM1;
+ for (i=0; i<=((n)-8); i+=8) {
+ YMM0 = _mm256_loadu_pd(x+i);
+ YMM1 = _mm256_loadu_pd(x+i+4);
+ YMM0 = _mm256_add_pd(YMM0, YMM15);
+ YMM1 = _mm256_add_pd(YMM1, YMM15);
+ _mm256_storeu_pd(y+i, YMM0);
+ _mm256_storeu_pd(y+i+4, YMM1);
+ }
+ for (; i<(n); i++) {
+ y[i] = x[i] + c;
+ }
+}
+
+void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) {
+ ptrdiff_t i;
+ ptrdiff_t off;
+ for (i=0; i<=((n)-16); i+=16) {
+ _mm256_storeu_ps(y+i, _mm256_loadu_ps(x+i));
+ _mm256_storeu_ps(y+i+8, _mm256_loadu_ps(x+i+8));
+ }
+ off = (n) - ((n)%16);
+ for (i=0; i<((n)%16); i++) {
+ y[off+i] = x[off+i];
+ }
+}
+
+void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ ptrdiff_t off;
+ __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+ for (i=0; i<=((n)-32); i+=32) {
+ _mm256_storeu_ps((x)+i , YMM0);
+ _mm256_storeu_ps((x)+i+8, YMM0);
+ _mm256_storeu_ps((x)+i+16, YMM0);
+ _mm256_storeu_ps((x)+i+24, YMM0);
+ }
+ off = (n) - ((n)%32);
+ for (i=0; i<((n)%32); i++) {
+ x[off+i] = c;
+ }
+}
+
+void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256 YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-16); i+=16) {
+ YMM0 = _mm256_loadu_ps(x+i);
+ YMM1 = _mm256_loadu_ps(x+i+8);
+ YMM2 = _mm256_loadu_ps(y+i);
+ YMM3 = _mm256_loadu_ps(y+i+8);
+ YMM2 = _mm256_div_ps(YMM0, YMM2);
+ YMM3 = _mm256_div_ps(YMM1, YMM3);
+ _mm256_storeu_ps(z+i, YMM2);
+ _mm256_storeu_ps(z+i+8, YMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] / y[i];
+ }
+}
+
+void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+ __m256 YMM0, YMM1;
+ for (i=0; i<=((n)-16); i+=16) {
+ YMM0 = _mm256_loadu_ps(x+i);
+ YMM1 = _mm256_loadu_ps(x+i+8);
+ YMM0 = _mm256_div_ps(YMM0, YMM15);
+ YMM1 = _mm256_div_ps(YMM1, YMM15);
+ _mm256_storeu_ps(y+i, YMM0);
+ _mm256_storeu_ps(y+i+8, YMM1);
+ }
+ for (; i<(n); i++) {
+ y[i] = x[i] / c;
+ }
+}
+
+void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256 YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-16); i+=16) {
+ YMM0 = _mm256_loadu_ps(x+i);
+ YMM1 = _mm256_loadu_ps(x+i+8);
+ YMM2 = _mm256_loadu_ps(y+i);
+ YMM3 = _mm256_loadu_ps(y+i+8);
+ YMM2 = _mm256_mul_ps(YMM0, YMM2);
+ YMM3 = _mm256_mul_ps(YMM1, YMM3);
+ _mm256_storeu_ps(z+i, YMM2);
+ _mm256_storeu_ps(z+i+8, YMM3);
+ }
+ for (; i<n; i++) {
+ z[i] = x[i] * y[i];
+ }
+}
+
+void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+ __m256 YMM0, YMM1;
+ for (i=0; i<=((n)-16); i+=16) {
+ YMM0 = _mm256_loadu_ps(x+i);
+ YMM1 = _mm256_loadu_ps(x+i+8);
+ YMM0 = _mm256_mul_ps(YMM0, YMM15);
+ YMM1 = _mm256_mul_ps(YMM1, YMM15);
+ _mm256_storeu_ps(y+i, YMM0);
+ _mm256_storeu_ps(y+i+8, YMM1);
+ }
+ for (; i<n; i++) {
+ y[i] = x[i] * c;
+ }
+}
+
+void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+ __m256 YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-8); i+=8) {
+ YMM0 = _mm256_loadu_ps(y+i);
+ YMM1 = _mm256_loadu_ps(x+i);
+ YMM2 = _mm256_mul_ps(YMM0, YMM15);
+ YMM3 = _mm256_add_ps(YMM1, YMM2);
+ _mm256_storeu_ps(z+i, YMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] + y[i] * c;
+ }
+}
+
+void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+ __m256 YMM0, YMM1;
+ for (i=0; i<=((n)-16); i+=16) {
+ YMM0 = _mm256_loadu_ps(x+i);
+ YMM1 = _mm256_loadu_ps(x+i+8);
+ YMM0 = _mm256_add_ps(YMM0, YMM15);
+ YMM1 = _mm256_add_ps(YMM1, YMM15);
+ _mm256_storeu_ps(y+i, YMM0);
+ _mm256_storeu_ps(y+i+8, YMM1);
+ }
+ for (; i<(n); i++) {
+ y[i] = x[i] + c;
+ }
+}
+
+#endif // defined(__AVX__)
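Each new AVX kernel in this file follows the same shape: an unrolled main loop of unaligned 256-bit loads and stores, then a scalar loop that finishes the leftover elements (either via an explicit offset, as in copy and fill, or by continuing the same index, as in the arithmetic kernels). A minimal sketch of that main-loop-plus-tail pattern for y = x + c, in the spirit of adds above; it assumes an AVX-enabled build and is illustrative only:

    #include <stddef.h>
    #include <immintrin.h>   /* requires an AVX-enabled build, e.g. -mavx */

    /* y[i] = x[i] + c, 4 doubles per iteration plus a scalar tail. */
    static void adds_avx_sketch(double *y, const double *x, double c, ptrdiff_t n) {
        ptrdiff_t i;
        __m256d vc = _mm256_set1_pd(c);          /* broadcast c into all 4 lanes   */
        for (i = 0; i <= n - 4; i += 4) {        /* vectorized main loop           */
            __m256d vx = _mm256_loadu_pd(x + i);
            _mm256_storeu_pd(y + i, _mm256_add_pd(vx, vc));
        }
        for (; i < n; i++)                       /* scalar tail for n % 4 elements */
            y[i] = x[i] + c;
    }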
diff --git a/lib/TH/vector/AVX.h b/lib/TH/vector/AVX.h
new file mode 100644
index 0000000..bfaeaa6
--- /dev/null
+++ b/lib/TH/vector/AVX.h
@@ -0,0 +1,23 @@
+#ifndef TH_AVX_H
+#define TH_AVX_H
+
+#include <stddef.h>
+
+void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
+void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
+void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+
+#endif
diff --git a/lib/TH/vector/AVX2.c b/lib/TH/vector/AVX2.c
new file mode 100644
index 0000000..082a680
--- /dev/null
+++ b/lib/TH/vector/AVX2.c
@@ -0,0 +1,47 @@
+#if defined(__AVX2__)
+#ifndef _MSC_VER
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+#include "AVX2.h"
+
+void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256d YMM15 = _mm256_set_pd(c, c, c, c);
+ __m256d YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-8); i+=8) {
+ YMM0 = _mm256_loadu_pd(y+i);
+ YMM1 = _mm256_loadu_pd(y+i+4);
+ YMM2 = _mm256_loadu_pd(x+i);
+ YMM3 = _mm256_loadu_pd(x+i+4);
+ YMM2 = _mm256_fmadd_pd(YMM0, YMM15, YMM2);
+ YMM3 = _mm256_fmadd_pd(YMM1, YMM15, YMM3);
+ _mm256_storeu_pd(z+i, YMM2);
+ _mm256_storeu_pd(z+i+4, YMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] + y[i] * c;
+ }
+}
+
+void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
+ __m256 YMM0, YMM1, YMM2, YMM3;
+ for (i=0; i<=((n)-16); i+=16) {
+ YMM0 = _mm256_loadu_ps(y+i);
+ YMM1 = _mm256_loadu_ps(y+i+8);
+ YMM2 = _mm256_loadu_ps(x+i);
+ YMM3 = _mm256_loadu_ps(x+i+8);
+ YMM2 = _mm256_fmadd_ps(YMM0, YMM15, YMM2);
+ YMM3 = _mm256_fmadd_ps(YMM1, YMM15, YMM3);
+ _mm256_storeu_ps(z+i, YMM2);
+ _mm256_storeu_ps(z+i+8, YMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] + y[i] * c;
+ }
+}
+
+#endif // defined(__AVX2__)
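The AVX2 variants of cadd use _mm256_fmadd_pd/_mm256_fmadd_ps, so each element is produced by a single fused multiply-add, i.e. y[i]*c + x[i] with one rounding instead of two. For reference, the scalar equivalent can be written with C99 fma() from <math.h>; a small illustrative sketch (link with -lm):

    #include <math.h>     /* fma() */
    #include <stddef.h>

    /* Reference semantics of the AVX2 cadd kernels: z[i] = x[i] + y[i]*c,
       evaluated as a fused multiply-add per element. */
    static void cadd_scalar_fma(double *z, const double *x, const double *y,
                                double c, ptrdiff_t n) {
        for (ptrdiff_t i = 0; i < n; i++)
            z[i] = fma(y[i], c, x[i]);   /* y[i]*c + x[i], one rounding */
    }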
diff --git a/lib/TH/vector/AVX2.h b/lib/TH/vector/AVX2.h
new file mode 100644
index 0000000..85a9e93
--- /dev/null
+++ b/lib/TH/vector/AVX2.h
@@ -0,0 +1,9 @@
+#ifndef TH_AVX2_H
+#define TH_AVX2_H
+
+#include <stddef.h>
+
+void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+
+#endif
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c
index 327b006..7920fb1 100644
--- a/lib/TH/vector/NEON.c
+++ b/lib/TH/vector/NEON.c
@@ -14,65 +14,92 @@ static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n)
}
-
-static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
+static void THFloatVector_cmul_NEON(float *z, const float *x, const float* y, const ptrdiff_t n) {
long i = 0;
for(; i < n-4; i += 4)
{
- z[i] = x[i] - y[i];
- z[i+1] = x[i+1] - y[i+1];
- z[i+2] = x[i+2] - y[i+2];
- z[i+3] = x[i+3] - y[i+3];
+ z[i] = x[i] * y[i];
+ z[i+1] = x[i+1] * y[i+1];
+ z[i+2] = x[i+2] * y[i+2];
+ z[i+3] = x[i+3] * y[i+3];
}
for(; i < n; i++)
- z[i] = x[i] - y[i];
+ z[i] = x[i] * y[i];
+}
+
+static void THFloatVector_muls_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ y[i] = x[i] * c;
+ y[i+1] = x[i+1] * c;
+ y[i+2] = x[i+2] * c;
+ y[i+3] = x[i+3] * c;
+ }
+ for(; i < n; i++)
+ y[i] = x[i] * c;
}
+static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+ long i = 0;
+
+ for(;i < n-4; i += 4)
+ {
+ z[i] = x[i] + c * y[i];
+ z[i+1] = x[i+1] + c * y[i+1];
+ z[i+2] = x[i+2] + c * y[i+2];
+ z[i+3] = x[i+3] + c * y[i+3];
+ }
+
+ for(; i < n; i++)
+ z[i] = x[i] + c * y[i];
+}
-static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
+static void THFloatVector_adds_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
long i = 0;
- for(; i < n-4; i +=4)
+ for(;i < n-4; i += 4)
{
- y[i] *= c;
- y[i+1] *= c;
- y[i+2] *= c;
- y[i+3] *= c;
+ y[i] = x[i] + c;
+ y[i+1] = x[i+1] + c;
+ y[i+2] = x[i+2] + c;
+ y[i+3] = x[i+3] + c;
}
for(; i < n; i++)
- y[i] *= c;
+ y[i] = x[i] + c;
}
-static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
+static void THFloatVector_cdiv_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
long i = 0;
- for(; i < n-4; i += 4)
+ for(;i < n-4; i += 4)
{
- y[i] *= x[i];
- y[i+1] *= x[i+1];
- y[i+2] *= x[i+2];
- y[i+3] *= x[i+3];
+ z[i] = x[i] / y[i];
+ z[i+1] = x[i+1] / y[i+1];
+ z[i+2] = x[i+2] / y[i+2];
+ z[i+3] = x[i+3] / y[i+3];
}
for(; i < n; i++)
- y[i] *= x[i];
+ z[i] = x[i] / y[i];
}
-static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
+static void THFloatVector_divs_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
long i = 0;
for(;i < n-4; i += 4)
{
- y[i] += c * x[i];
- y[i+1] += c * x[i+1];
- y[i+2] += c * x[i+2];
- y[i+3] += c * x[i+3];
+ y[i] = x[i] / c;
+ y[i+1] = x[i+1] / c;
+ y[i+2] = x[i+2] / c;
+ y[i+3] = x[i+3] / c;
}
for(; i < n; i++)
- y[i] += c * x[i];
+ y[i] = x[i] / c;
}
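Despite living in NEON.c, the rewritten float kernels above are plain C loops unrolled by four, with vectorization left to the compiler. For comparison, an explicit NEON-intrinsic version of adds might look like the sketch below; this uses <arm_neon.h> on ARM targets and is an illustration, not code from this patch:

    #include <stddef.h>
    #include <arm_neon.h>   /* ARM targets only */

    static void adds_neon_sketch(float *y, const float *x, float c, ptrdiff_t n) {
        ptrdiff_t i = 0;
        float32x4_t vc = vdupq_n_f32(c);               /* broadcast c into 4 lanes */
        for (; i + 4 <= n; i += 4)
            vst1q_f32(y + i, vaddq_f32(vld1q_f32(x + i), vc));
        for (; i < n; i++)                             /* scalar tail */
            y[i] = x[i] + c;
    }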
diff --git a/lib/TH/vector/SSE.c b/lib/TH/vector/SSE.c
index 781b037..d026935 100644
--- a/lib/TH/vector/SSE.c
+++ b/lib/TH/vector/SSE.c
@@ -4,7 +4,6 @@
#include <intrin.h>
#endif
-
static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n) {
ptrdiff_t i;
ptrdiff_t off;
@@ -21,70 +20,40 @@ static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n
}
}
-
-static void THDoubleVector_add_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
- ptrdiff_t i = 0;
+static void THDoubleVector_cadd_SSE(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
__m128d XMM7 = _mm_set1_pd(c);
- __m128d XMM0,XMM2;
- for (; i<=((n)-2); i+=2) {
+ __m128d XMM0, XMM2;
+ for (i=0; i<=((n)-2); i+=2) {
XMM0 = _mm_loadu_pd((x)+i);
XMM2 = _mm_loadu_pd((y)+i);
- XMM0 = _mm_mul_pd(XMM0, XMM7);
- XMM2 = _mm_add_pd(XMM2, XMM0);
- _mm_storeu_pd((y)+i , XMM2);
+ XMM2 = _mm_mul_pd(XMM2, XMM7);
+ XMM2 = _mm_add_pd(XMM0, XMM2);
+ _mm_storeu_pd((z)+i, XMM2);
}
for (; i<(n); i++) {
- y[i] += c * x[i];
- }
-}
-
-
-static void THDoubleVector_diff_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
- ptrdiff_t i;
- for (i=0; i<=((n)-8); i+=8) {
- __m128d XMM0 = _mm_loadu_pd((x)+i );
- __m128d XMM1 = _mm_loadu_pd((x)+i+2);
- __m128d XMM2 = _mm_loadu_pd((x)+i+4);
- __m128d XMM3 = _mm_loadu_pd((x)+i+6);
- __m128d XMM4 = _mm_loadu_pd((y)+i );
- __m128d XMM5 = _mm_loadu_pd((y)+i+2);
- __m128d XMM6 = _mm_loadu_pd((y)+i+4);
- __m128d XMM7 = _mm_loadu_pd((y)+i+6);
- XMM0 = _mm_sub_pd(XMM0, XMM4);
- XMM1 = _mm_sub_pd(XMM1, XMM5);
- XMM2 = _mm_sub_pd(XMM2, XMM6);
- XMM3 = _mm_sub_pd(XMM3, XMM7);
- _mm_storeu_pd((z)+i , XMM0);
- _mm_storeu_pd((z)+i+2, XMM1);
- _mm_storeu_pd((z)+i+4, XMM2);
- _mm_storeu_pd((z)+i+6, XMM3);
- }
- ptrdiff_t off = (n) - ((n)%8);
- for (i=0; i<((n)%8); i++) {
- z[off+i] = x[off+i] - y[off+i];
+ z[i] = x[i] + c * y[i];
}
}
-
-static void THDoubleVector_scale_SSE(double *y, const double c, const ptrdiff_t n) {
+static void THDoubleVector_adds_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
ptrdiff_t i;
__m128d XMM7 = _mm_set1_pd(c);
+ __m128d XMM0, XMM2;
for (i=0; i<=((n)-4); i+=4) {
- __m128d XMM0 = _mm_loadu_pd((y)+i );
- __m128d XMM1 = _mm_loadu_pd((y)+i+2);
- XMM0 = _mm_mul_pd(XMM0, XMM7);
- XMM1 = _mm_mul_pd(XMM1, XMM7);
- _mm_storeu_pd((y)+i , XMM0);
- _mm_storeu_pd((y)+i+2, XMM1);
+ XMM0 = _mm_loadu_pd((x)+i);
+ XMM2 = _mm_loadu_pd((x)+i+2);
+ XMM0 = _mm_add_pd(XMM0, XMM7);
+ XMM2 = _mm_add_pd(XMM2, XMM7);
+ _mm_storeu_pd((y)+i, XMM0);
+ _mm_storeu_pd((y)+i+2, XMM2);
}
- ptrdiff_t off = (n) - ((n)%4);
- for (i=0; i<((n)%4); i++) {
- y[off+i] *= c;
+ for (; i<(n); i++) {
+ y[i] = x[i] + c;
}
}
-
-static void THDoubleVector_mul_SSE(double *y, const double *x, const ptrdiff_t n) {
+static void THDoubleVector_cmul_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
ptrdiff_t i;
for (i=0; i<=((n)-8); i+=8) {
__m128d XMM0 = _mm_loadu_pd((x)+i );
@@ -99,17 +68,72 @@ static void THDoubleVector_mul_SSE(double *y, const double *x, const ptrdiff_t n
XMM5 = _mm_mul_pd(XMM5, XMM1);
XMM6 = _mm_mul_pd(XMM6, XMM2);
XMM7 = _mm_mul_pd(XMM7, XMM3);
+ _mm_storeu_pd((z)+i , XMM4);
+ _mm_storeu_pd((z)+i+2, XMM5);
+ _mm_storeu_pd((z)+i+4, XMM6);
+ _mm_storeu_pd((z)+i+6, XMM7);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] * y[i];
+ }
+}
+
+static void THDoubleVector_muls_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m128d XMM15 = _mm_set1_pd(c);
+ for (i=0; i<=((n)-8); i+=8) {
+ __m128d XMM0 = _mm_loadu_pd((x)+i );
+ __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+ __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+ __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+ __m128d XMM4 = _mm_mul_pd(XMM15, XMM0);
+ __m128d XMM5 = _mm_mul_pd(XMM15, XMM1);
+ __m128d XMM6 = _mm_mul_pd(XMM15, XMM2);
+ __m128d XMM7 = _mm_mul_pd(XMM15, XMM3);
_mm_storeu_pd((y)+i , XMM4);
_mm_storeu_pd((y)+i+2, XMM5);
_mm_storeu_pd((y)+i+4, XMM6);
_mm_storeu_pd((y)+i+6, XMM7);
}
- ptrdiff_t off = (n) - ((n)%8);
- for (i=0; i<((n)%8); i++) {
- y[off+i] *= x[off+i];
+ for (; i<(n); i++) {
+ y[i] = x[i] * c;
+ }
+}
+
+static void THDoubleVector_cdiv_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m128d XMM0, XMM1, XMM2, XMM3;
+ for (i=0; i<=((n)-4); i+=4) {
+ XMM0 = _mm_loadu_pd(x+i);
+ XMM1 = _mm_loadu_pd(x+i+2);
+ XMM2 = _mm_loadu_pd(y+i);
+ XMM3 = _mm_loadu_pd(y+i+2);
+ XMM2 = _mm_div_pd(XMM0, XMM2);
+ XMM3 = _mm_div_pd(XMM1, XMM3);
+ _mm_storeu_pd(z+i, XMM2);
+ _mm_storeu_pd(z+i+2, XMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] / y[i];
}
}
+static void THDoubleVector_divs_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m128d XMM7 = _mm_set1_pd(c);
+ __m128d XMM0, XMM1;
+ for (i=0; i<=((n)-4); i+=4) {
+ XMM0 = _mm_loadu_pd(x+i);
+ XMM1 = _mm_loadu_pd(x+i+2);
+ XMM0 = _mm_div_pd(XMM0, XMM7);
+ XMM1 = _mm_div_pd(XMM1, XMM7);
+ _mm_storeu_pd(y+i, XMM0);
+ _mm_storeu_pd(y+i+2, XMM1);
+ }
+ for (; i<(n); i++) {
+ y[i] = x[i] / c;
+ }
+}
static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) {
ptrdiff_t i;
@@ -128,24 +152,40 @@ static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) {
}
-static void THFloatVector_add_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
- ptrdiff_t i = 0;
+static void THFloatVector_cadd_SSE(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
__m128 XMM7 = _mm_set_ps1(c);
- __m128 XMM0,XMM2;
- for (; i<=((n)-4); i+=4) {
+ __m128 XMM0, XMM2;
+ for (i=0; i<=((n)-4); i+=4) {
XMM0 = _mm_loadu_ps((x)+i);
XMM2 = _mm_loadu_ps((y)+i);
- XMM0 = _mm_mul_ps(XMM0, XMM7);
- XMM2 = _mm_add_ps(XMM2, XMM0);
- _mm_storeu_ps((y)+i , XMM2);
+ XMM2 = _mm_mul_ps(XMM2, XMM7);
+ XMM2 = _mm_add_ps(XMM0, XMM2);
+ _mm_storeu_ps((z)+i, XMM2);
}
for (; i<(n); i++) {
- y[i] += c * x[i];
+ z[i] = x[i] + c * y[i];
}
}
+static void THFloatVector_adds_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m128 XMM7 = _mm_set1_ps(c);
+ __m128 XMM0, XMM2;
+ for (i=0; i<=((n)-8); i+=8) {
+ XMM0 = _mm_loadu_ps((x)+i);
+ XMM2 = _mm_loadu_ps((x)+i+4);
+ XMM0 = _mm_add_ps(XMM0, XMM7);
+ XMM2 = _mm_add_ps(XMM2, XMM7);
+ _mm_storeu_ps((y)+i, XMM0);
+ _mm_storeu_ps((y)+i+4, XMM2);
+ }
+ for (; i<(n); i++) {
+ y[i] = x[i] + c;
+ }
+}
-static void THFloatVector_diff_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
+static void THFloatVector_cmul_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
ptrdiff_t i;
for (i=0; i<=((n)-16); i+=16) {
__m128 XMM0 = _mm_loadu_ps((x)+i );
@@ -156,62 +196,73 @@ static void THFloatVector_diff_SSE(float *z, const float *x, const float *y, con
__m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
__m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
__m128 XMM7 = _mm_loadu_ps((y)+i+12);
- XMM0 = _mm_sub_ps(XMM0, XMM4);
- XMM1 = _mm_sub_ps(XMM1, XMM5);
- XMM2 = _mm_sub_ps(XMM2, XMM6);
- XMM3 = _mm_sub_ps(XMM3, XMM7);
- _mm_storeu_ps((z)+i , XMM0);
- _mm_storeu_ps((z)+i+ 4, XMM1);
- _mm_storeu_ps((z)+i+ 8, XMM2);
- _mm_storeu_ps((z)+i+12, XMM3);
- }
- ptrdiff_t off = (n) - ((n)%16);
- for (i=0; i<((n)%16); i++) {
- z[off+i] = x[off+i] - y[off+i];
+ XMM4 = _mm_mul_ps(XMM4, XMM0);
+ XMM5 = _mm_mul_ps(XMM5, XMM1);
+ XMM6 = _mm_mul_ps(XMM6, XMM2);
+ XMM7 = _mm_mul_ps(XMM7, XMM3);
+ _mm_storeu_ps((z)+i , XMM4);
+ _mm_storeu_ps((z)+i+ 4, XMM5);
+ _mm_storeu_ps((z)+i+ 8, XMM6);
+ _mm_storeu_ps((z)+i+12, XMM7);
}
-}
-
-
-static void THFloatVector_scale_SSE(float *y, const float c, const ptrdiff_t n) {
- ptrdiff_t i;
- __m128 XMM7 = _mm_set_ps1(c);
- for (i=0; i<=((n)-8); i+=8) {
- __m128 XMM0 = _mm_loadu_ps((y)+i );
- __m128 XMM1 = _mm_loadu_ps((y)+i+4);
- XMM0 = _mm_mul_ps(XMM0, XMM7);
- XMM1 = _mm_mul_ps(XMM1, XMM7);
- _mm_storeu_ps((y)+i , XMM0);
- _mm_storeu_ps((y)+i+4, XMM1);
- }
- ptrdiff_t off = (n) - ((n)%8);
- for (i=0; i<((n)%8); i++) {
- y[off+i] *= c;
+ for (; i<(n); i++) {
+ z[i] = x[i] * y[i];
}
}
-
-static void THFloatVector_mul_SSE(float *y, const float *x, const ptrdiff_t n) {
+static void THFloatVector_muls_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
ptrdiff_t i;
+ __m128 XMM15 = _mm_set_ps1(c);
for (i=0; i<=((n)-16); i+=16) {
__m128 XMM0 = _mm_loadu_ps((x)+i );
__m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
__m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
__m128 XMM3 = _mm_loadu_ps((x)+i+12);
- __m128 XMM4 = _mm_loadu_ps((y)+i );
- __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
- __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
- __m128 XMM7 = _mm_loadu_ps((y)+i+12);
- XMM4 = _mm_mul_ps(XMM4, XMM0);
- XMM5 = _mm_mul_ps(XMM5, XMM1);
- XMM6 = _mm_mul_ps(XMM6, XMM2);
- XMM7 = _mm_mul_ps(XMM7, XMM3);
+ __m128 XMM4 = _mm_mul_ps(XMM15, XMM0);
+ __m128 XMM5 = _mm_mul_ps(XMM15, XMM1);
+ __m128 XMM6 = _mm_mul_ps(XMM15, XMM2);
+ __m128 XMM7 = _mm_mul_ps(XMM15, XMM3);
_mm_storeu_ps((y)+i , XMM4);
_mm_storeu_ps((y)+i+ 4, XMM5);
_mm_storeu_ps((y)+i+ 8, XMM6);
_mm_storeu_ps((y)+i+12, XMM7);
}
- ptrdiff_t off = (n) - ((n)%16);
- for (i=0; i<((n)%16); i++) {
- y[off+i] *= x[off+i];
+ for (; i<(n); i++) {
+ y[i] = x[i] * c;
+ }
+}
+
+static void THFloatVector_cdiv_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m128 XMM0, XMM1, XMM2, XMM3;
+ for (i=0; i<=((n)-8); i+=8) {
+ XMM0 = _mm_loadu_ps(x+i);
+ XMM1 = _mm_loadu_ps(x+i+4);
+ XMM2 = _mm_loadu_ps(y+i);
+ XMM3 = _mm_loadu_ps(y+i+4);
+ XMM2 = _mm_div_ps(XMM0, XMM2);
+ XMM3 = _mm_div_ps(XMM1, XMM3);
+ _mm_storeu_ps(z+i, XMM2);
+ _mm_storeu_ps(z+i+4, XMM3);
+ }
+ for (; i<(n); i++) {
+ z[i] = x[i] / y[i];
+ }
+}
+
+static void THFloatVector_divs_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
+ ptrdiff_t i;
+ __m128 XMM7 = _mm_set1_ps(c);
+ __m128 XMM0, XMM1;
+ for (i=0; i<=((n)-8); i+=8) {
+ XMM0 = _mm_loadu_ps(x+i);
+ XMM1 = _mm_loadu_ps(x+i+4);
+ XMM0 = _mm_div_ps(XMM0, XMM7);
+ XMM1 = _mm_div_ps(XMM1, XMM7);
+ _mm_storeu_ps(y+i, XMM0);
+ _mm_storeu_ps(y+i+4, XMM1);
+ }
+ for (; i<(n); i++) {
+ y[i] = x[i] / c;
}
}
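The SSE kernels switch from the old in-place forms (y += c*x, y *= c, y *= x, z = x - y) to the out-of-place signatures used by the new dispatch names (z = x + c*y, y = x + c, z = x*y, y = x*c, z = x/y, y = x/c). When validating such kernels it is handy to compare against a scalar reference; a small self-contained check of the cadd semantics (the names below are hypothetical, not the TH test harness):

    #include <assert.h>
    #include <math.h>
    #include <stddef.h>

    /* Scalar reference for the new cadd signature: z[i] = x[i] + c * y[i]. */
    static void cadd_ref(double *z, const double *x, const double *y,
                         double c, ptrdiff_t n) {
        for (ptrdiff_t i = 0; i < n; i++)
            z[i] = x[i] + c * y[i];
    }

    /* Stand-in for the kernel under test; swap in a SIMD implementation here. */
    static void cadd_under_test(double *z, const double *x, const double *y,
                                double c, ptrdiff_t n) {
        cadd_ref(z, x, y, c, n);
    }

    int main(void) {
        enum { N = 7 };                  /* odd length so a tail path is exercised */
        double x[N], y[N], z[N], ref[N];
        for (int i = 0; i < N; i++) { x[i] = i; y[i] = 2.0 * i + 1.0; }
        cadd_ref(ref, x, y, 0.5, N);
        cadd_under_test(z, x, y, 0.5, N);
        for (int i = 0; i < N; i++)
            assert(fabs(z[i] - ref[i]) < 1e-12);
        return 0;
    }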
diff --git a/test/test.lua b/test/test.lua
index e7e26e4..6221854 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -112,7 +112,7 @@ local genericSingleOpTest = [[
end
end
return maxerrc, maxerrnc
-]]
+--]]
function torchtest.sin()
local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'sin'))
@@ -343,6 +343,12 @@ function torchtest.round()
end
function torchtest.max() -- torch.max([resval, resind,] x [,dim])
+
+ -- TH_TENSOR_BASE
+ local m1 = torch.Tensor(8,2):fill(3):select(2, 1)
+ local resval, resind = torch.max(m1, 1)
+ mytester:assert(resind[1] == 1)
+
-- torch.max( x )
-- contiguous
local m1 = torch.randn(100,100)
@@ -357,6 +363,7 @@ function torchtest.max() -- torch.max([resval, resind,] x [,dim])
end
local err = res1 - res2
mytester:assertlt(err, precision, 'error in torch.max - contiguous')
+
-- non-contiguous
local m1 = torch.randn(10,10,10)
local m2 = m1[{{}, 4, {}}]
@@ -371,33 +378,34 @@ function torchtest.max() -- torch.max([resval, resind,] x [,dim])
end
local err = res1 - res2
mytester:assertlt(err, precision, 'error in torch.max - non-contiguous')
+
-- torch.max([resval, resind,] x ,dim])
- local m1 = torch.randn(100,100)
- local res1val, res1ind = torch.max(m1, 2)
- local res2val = res1val:clone():zero()
- local res2ind = res1ind:clone():zero()
- for i=1, m1:size(1) do
- res2val[i] = m1[i][1]
- res2ind[i] = 1
- for j=1, m1:size(2) do
- if m1[i][j] > res2val[i][1] then
- res2val[i] = m1[i][j]
- res2ind[i] = j
+ function lua_max(t, dim)
+ assert(t:nDimension() == 2)
+ max_val = t:narrow(dim, 1, 1):clone()
+ max_ind = t:narrow(dim, 1, 1):clone():long():fill(1)
+ other = 3 - dim
+ for i = 1, t:size(other) do
+ for j = 1, t:size(dim) do
+ val = t:select(other, i):select(dim, j)
+ max = max_val:select(other, i):select(dim, 1)
+ if val > max then
+ max_val:select(other, i):fill(val)
+ max_ind:select(other, i):fill(j)
+ end
end
end
+ return max_val, max_ind
end
- local errval = res1val:clone():zero()
- for i = 1, res1val:size(1) do
- errval[i] = math.abs(res1val[i][1] - res2val[i][1])
- mytester:asserteq(res1ind[i][1], res2ind[i][1], 'error in torch.max - non-contiguous')
- end
- local maxerr = 0
- for i = 1, errval:size(1) do
- if errval[i][1] > maxerr then
- maxerr = errval[i]
- end
+
+ local m1 = torch.randn(100,100)
+ for dim = 1,2 do
+ local res1val, res1ind = torch.max(m1, dim)
+ local res2val, res2ind = lua_max(m1, dim)
+ mytester:asserteq((res1val-res2val):abs():max(), 0, 'error in torch.max')
+ mytester:asserteq((res1ind-res2ind):abs():max(), 0, 'error in torch.max')
end
- mytester:assertlt(maxerr, precision, 'error in torch.max - non-contiguous')
+
-- NaNs
for index in pairs{1, 5, 100} do
local m1 = torch.randn(100)
@@ -439,33 +447,34 @@ function torchtest.min() -- torch.min([resval, resind,] x [,dim])
end
local err = res1 - res2
mytester:assertlt(err, precision, 'error in torch.min - non-contiguous')
- -- torch.min([resval, resind,] x ,dim])
- local m1 = torch.randn(100,100)
- local res1val, res1ind = torch.min(m1, 2)
- local res2val = res1val:clone():zero()
- local res2ind = res1ind:clone():zero()
- for i=1, m1:size(1) do
- res2val[i] = m1[i][1]
- res2ind[i] = 1
- for j=1, m1:size(2) do
- if m1[i][j] < res2val[i][1] then
- res2val[i] = m1[i][j]
- res2ind[i] = j
+
+ -- torch.min([resval, resind,] x ,dim])
+ function lua_min(t, dim)
+ assert(t:nDimension() == 2)
+ max_val = t:narrow(dim, 1, 1):clone()
+ max_ind = t:narrow(dim, 1, 1):clone():long():fill(1)
+ other = 3 - dim
+ for i = 1, t:size(other) do
+ for j = 1, t:size(dim) do
+ val = t:select(other, i):select(dim, j)
+ max = max_val:select(other, i):select(dim, 1)
+ if val < max then
+ max_val:select(other, i):fill(val)
+ max_ind:select(other, i):fill(j)
+ end
end
end
+ return max_val, max_ind
end
- local errval = res1val:clone():zero()
- for i = 1, res1val:size(1) do
- errval[i] = math.abs(res1val[i][1] - res2val[i][1])
- mytester:asserteq(res1ind[i][1], res2ind[i][1], 'error in torch.min - non-contiguous')
- end
- local minerr = 0
- for i = 1, errval:size(1) do
- if errval[i][1] < minerr then
- minerr = errval[i]
- end
+
+ local m1 = torch.randn(100,100)
+ for dim = 1,2 do
+ local res1val, res1ind = torch.min(m1, dim)
+ local res2val, res2ind = lua_min(m1, dim)
+ mytester:asserteq((res1val-res2val):abs():max(), 0, 'error in torch.min')
+ mytester:asserteq((res1ind-res2ind):abs():max(), 0, 'error in torch.min')
end
- mytester:assertlt(minerr, precision, 'error in torch.min - non-contiguous')
+
-- NaNs
for index in pairs{1, 5, 100} do
local m1 = torch.randn(100)
@@ -476,6 +485,11 @@ function torchtest.min() -- torch.min([resval, resind,] x [,dim])
local res1val = torch.min(m1)
mytester:assert(res1val ~= res1val, 'error in torch.min - NaNs')
end
+
+ -- TH_TENSOR_BASE
+ local m1 = torch.Tensor(4):fill(3)
+ local resval, resind = torch.min(m1, 1)
+ mytester:assert(resind[1] == 1)
end
function torchtest.cmax()
@@ -574,64 +588,117 @@ function torchtest.mv()
mytester:assertlt(err, precision, 'error in torch.mv')
end
-function torchtest.add()
- -- [res] torch.add([res,] tensor1, tensor2)
- local m1 = torch.randn(100,100)
- local v1 = torch.randn(100)
+function torchtest.fill()
+ local types = {
+ 'torch.ByteTensor',
+ 'torch.CharTensor',
+ 'torch.ShortTensor',
+ 'torch.IntTensor',
+ 'torch.FloatTensor',
+ 'torch.DoubleTensor',
+ 'torch.LongTensor',
+ }
- local res1 = torch.add(m1[{ 4,{} }],v1)
+ for k,t in ipairs(types) do
+ -- [res] torch.fill([res,] tensor, value)
+ local m1 = torch.ones(100,100):type(t)
+ local res1 = m1:clone()
+ res1[{ 3,{} }]:fill(2)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ 3,i }] = 2
+ end
- local res2 = res1:clone():zero()
- for i = 1,m1:size(2) do
- res2[i] = m1[4][i] + v1[i]
- end
+ local err = (res1-res2):double():abs():max()
- local err = (res1-res2):abs():max()
+ mytester:assertlt(err, precision, 'error in torch.fill - contiguous')
- mytester:assertlt(err, precision, 'error in torch.add - contiguous')
+ local m1 = torch.ones(100,100):type(t)
+ local res1 = m1:clone()
+ res1[{ {},3 }]:fill(2)
- local m1 = torch.randn(100,100)
- local v1 = torch.randn(100)
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = 2
+ end
- local res1 = torch.add(m1[{ {},4 }],v1)
+ local err = (res1-res2):double():abs():max()
- local res2 = res1:clone():zero()
- for i = 1,m1:size(1) do
- res2[i] = m1[i][4] + v1[i]
+ mytester:assertlt(err, precision, 'error in torch.fill - non contiguous')
end
+end
- local err = (res1-res2):abs():max()
+function torchtest.add()
+ local types = {
+ 'torch.ByteTensor',
+ 'torch.CharTensor',
+ 'torch.ShortTensor',
+ 'torch.IntTensor',
+ 'torch.FloatTensor',
+ 'torch.DoubleTensor',
+ 'torch.LongTensor',
+ }
- mytester:assertlt(err, precision, 'error in torch.add - non contiguous')
+ for k,t in ipairs(types) do
+ -- [res] torch.add([res,] tensor1, tensor2)
+ local m1 = torch.randn(100,100):type(t)
+ local v1 = torch.randn(100):type(t)
- -- [res] torch.add([res,] tensor, value)
- local m1 = torch.randn(10,10)
- local res1 = m1:clone()
- res1[{ 3,{} }]:add(2)
+ local res1 = torch.add(m1[{ 4,{} }],v1)
- local res2 = m1:clone()
- for i = 1,m1:size(1) do
- res2[{ 3,i }] = res2[{ 3,i }] + 2
- end
+ local res2 = res1:clone():zero()
+ for i = 1,m1:size(2) do
+ res2[i] = m1[4][i] + v1[i]
+ end
- local err = (res1-res2):abs():max()
+ local err = (res1-res2):double():abs():max()
- mytester:assertlt(err, precision, 'error in torch.add - scalar, contiguous')
+ mytester:assertlt(err, precision, 'error in torch.add - contiguous' .. ' ' .. t)
- local m1 = torch.randn(10,10)
- local res1 = m1:clone()
- res1[{ {},3 }]:add(2)
+ local m1 = torch.randn(100,100):type(t)
+ local v1 = torch.randn(100):type(t)
- local res2 = m1:clone()
- for i = 1,m1:size(1) do
- res2[{ i,3 }] = res2[{ i,3 }] + 2
- end
+ local res1 = torch.add(m1[{ {},4 }],v1)
- local err = (res1-res2):abs():max()
+ local res2 = res1:clone():zero()
+ for i = 1,m1:size(1) do
+ res2[i] = m1[i][4] + v1[i]
+ end
+
+ local err = (res1-res2):double():abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.add - non contiguous' .. ' ' .. t)
+
+ -- [res] torch.add([res,] tensor, value)
+ local m1 = torch.randn(10,10):type(t)
+ local res1 = m1:clone()
+ res1[{ 3,{} }]:add(2)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ 3,i }] = res2[{ 3,i }] + 2
+ end
+
+ local err = (res1-res2):double():abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.add - scalar, contiguous' .. ' ' .. t)
+
+ local m1 = torch.randn(10,10)
+ local res1 = m1:clone()
+ res1[{ {},3 }]:add(2)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = res2[{ i,3 }] + 2
+ end
+
+ local err = (res1-res2):abs():max()
- mytester:assertlt(err, precision, 'error in torch.add - scalar, non contiguous')
+ mytester:assertlt(err, precision, 'error in torch.add - scalar, non contiguous' .. ' ' .. t)
- -- [res] torch.add([res,] tensor1, value, tensor2)
+ -- [res] torch.add([res,] tensor1, value, tensor2)
+ end
end
function torchtest.csub()
@@ -699,35 +766,130 @@ function torchtest.cinv()
end
function torchtest.mul()
- local m1 = torch.randn(10,10)
+ local types = {
+ 'torch.ByteTensor',
+ 'torch.CharTensor',
+ 'torch.ShortTensor',
+ 'torch.IntTensor',
+ 'torch.FloatTensor',
+ 'torch.DoubleTensor',
+ 'torch.LongTensor',
+ }
+
+ for k,t in ipairs(types) do
+ local m1 = torch.randn(10,10):type(t)
+ local res1 = m1:clone()
+
+ res1[{ {},3 }]:mul(2)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = res2[{ i,3 }] * 2
+ end
+
+ local err = (res1-res2):double():abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.mul - scalar, non contiguous' .. ' ' .. t)
+ end
+end
+
+function torchtest.div()
+ local types = {
+ 'torch.ByteTensor',
+ 'torch.CharTensor',
+ 'torch.ShortTensor',
+ 'torch.IntTensor',
+ 'torch.FloatTensor',
+ 'torch.DoubleTensor',
+ 'torch.LongTensor',
+ }
+
+ for k,t in ipairs(types) do
+
+ local m1 = torch.randn(10,10):type(t)
+ local res1 = m1:clone()
+
+ res1[{ {},3 }]:div(2)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = res2[{ i,3 }] / 2
+ end
+
+ local err = (res1-res2):double():abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.div - scalar, non contiguous' .. ' ' .. t)
+ end
+end
+
+function torchtest.lshift()
+ local m1 = torch.LongTensor(10,10):random(0,100)
+ local res1 = m1:clone()
+
+ local q = 2
+ local f = math.pow(2, q)
+ res1[{ {},3 }]:lshift(q)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = res2[{ i,3 }] * f
+ end
+
+ local err = (res1-res2):abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.lshift - scalar, non contiguous')
+
+ local m1 = torch.LongTensor(10,10):random(0,100)
local res1 = m1:clone()
- res1[{ {},3 }]:mul(2)
+ local q = 2
+ res1:lshift(q)
local res2 = m1:clone()
for i = 1,m1:size(1) do
- res2[{ i,3 }] = res2[{ i,3 }] * 2
+ for j = 1,m1:size(1) do
+ res2[{ i,j }] = res2[{ i,j }] * f
+ end
end
local err = (res1-res2):abs():max()
- mytester:assertlt(err, precision, 'error in torch.mul - scalar, non contiguous')
+ mytester:assertlt(err, precision, 'error in torch.lshift - scalar, contiguous')
end
-function torchtest.div()
- local m1 = torch.randn(10,10)
+function torchtest.rshift()
+ local m1 = torch.LongTensor(10,10):random(0,100)
+ local res1 = m1:clone()
+
+ local q = 2
+ local f = math.pow(2, q)
+ res1[{ {},3 }]:rshift(q)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = math.floor(res2[{ i,3 }] / f)
+ end
+
+ local err = (res1-res2):abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.rshift - scalar, non contiguous')
+
+ local m1 = torch.LongTensor(10,10):random(0,100)
local res1 = m1:clone()
- res1[{ {},3 }]:div(2)
+ local q = 2
+ res1:rshift(q)
local res2 = m1:clone()
for i = 1,m1:size(1) do
- res2[{ i,3 }] = res2[{ i,3 }] / 2
+ for j = 1,m1:size(1) do
+ res2[{ i,j }] = math.floor(res2[{ i,j }] / f)
+ end
end
local err = (res1-res2):abs():max()
- mytester:assertlt(err, precision, 'error in torch.div - scalar, non contiguous')
+ mytester:assertlt(err, precision, 'error in torch.rshift - scalar, contiguous')
end
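The lshift and rshift tests above rely on the identities x << q == x * 2^q and x >> q == floor(x / 2^q) for non-negative x, which is why the Lua reference multiplies or floor-divides by f = 2^q. A one-file C check of those identities over the same 0..100 range; illustrative only:

    #include <assert.h>

    int main(void) {
        const int q = 2;
        const long f = 1L << q;                 /* f = 2^q = 4 */
        for (long x = 0; x <= 100; x++) {
            assert((x << q) == x * f);          /* lshift multiplies by 2^q    */
            assert((x >> q) == x / f);          /* rshift floor-divides by 2^q */
        }
        return 0;
    }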
function torchtest.fmod()
@@ -764,6 +926,86 @@ function torchtest.remainder()
mytester:assertlt(err, precision, 'error in torch.remainder - scalar, non contiguous')
end
+function torchtest.bitand()
+ local m1 = torch.LongTensor(10,10):random(0,100)
+ local res1 = m1:clone()
+
+ local val = 32 -- This should be a power of 2
+ res1[{ {},3 }]:bitand(val - 1)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = res2[{ i,3 }] % val
+ end
+
+ local err = (res1-res2):abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.bitand - scalar, non contiguous')
+
+ local m1 = torch.LongTensor(10,10):random(0,100)
+ local res1 = m1:clone()
+
+ res1:bitand(val - 1)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ for j = 1,m1:size(1) do
+ res2[{ i,j }] = res2[{ i,j }] % val
+ end
+ end
+
+ local err = (res1-res2):abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.bitand - scalar, contiguous')
+end
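torchtest.bitand uses the usual power-of-two trick: for val = 2^k and non-negative x, x & (val - 1) equals x % val, so the Lua reference can be written with the modulo operator. A minimal C check of that identity with the test's values (val = 32, x in 0..100):

    #include <assert.h>

    int main(void) {
        const long val = 32;                     /* must be a power of two */
        for (long x = 0; x <= 100; x++)
            assert((x & (val - 1)) == x % val);  /* mask == modulo for 2^k, x >= 0 */
        return 0;
    }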
+
+function torchtest.bitor()
+ local m1 = torch.LongTensor(10,10):random(0,10000)
+ local res1 = m1:clone()
+
+ local val = 32 -- This should be a power of 2
+ res1[{ {},3 }]:bitor(val-1)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ res2[{ i,3 }] = math.floor(res2[{ i,3 }] / val) * val + (val - 1)
+ end
+
+ local err = (res1-res2):abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.bitor - scalar, non contiguous')
+
+ local m1 = torch.LongTensor(10,10):random(0,10000)
+ local res1 = m1:clone()
+
+ res1:bitor(val - 1)
+
+ local res2 = m1:clone()
+ for i = 1,m1:size(1) do
+ for j = 1,m1:size(1) do
+ res2[{ i,j }] = math.floor(res2[{ i,j }] / val) * val + (val - 1)
+ end
+ end
+
+ local err = (res1-res2):abs():max()
+
+ mytester:assertlt(err, precision, 'error in torch.bitor - scalar, contiguous')
+end
+
+function torchtest.cbitxor()
+ local t1 = torch.LongTensor(10,10):random(0,10000)
+ local t2 = torch.LongTensor(10,10):random(10001,20000)
+
+ -- Perform xor swap and check results
+ local t3 = torch.cbitxor(t1, t2)
+ local r1 = torch.cbitxor(t3, t2)
+ local r2 = torch.cbitxor(t3, t1)
+
+ local err1 = (r1 - t1):abs():max()
+ local err2 = (r2 - t2):abs():max()
+ mytester:assertlt(err1 + err2, precision, 'error in torch.cbitxor contiguous')
+end
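torchtest.cbitxor leans on xor being an involution: with t3 = t1 ^ t2, xor-ing t3 with either operand recovers the other. A tiny C demonstration of that property; illustrative only:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        uint64_t t1 = 1234, t2 = 98765;
        uint64_t t3 = t1 ^ t2;          /* combine                 */
        assert((t3 ^ t2) == t1);        /* xor with t2 recovers t1 */
        assert((t3 ^ t1) == t2);        /* xor with t1 recovers t2 */
        return 0;
    }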
+
function torchtest.mm()
-- helper function
local function matrixmultiply(mat1,mat2)
@@ -1016,68 +1258,84 @@ function torchtest.pow() -- [res] torch.pow([res,] x)
mytester:assertlt(maxerr, precision, 'error in torch.pow - non-contiguous')
end
-function torchtest.cdiv() -- [res] torch.cdiv([res,] tensor1, tensor2)
- -- contiguous
- local m1 = torch.randn(10, 10, 10)
- local m2 = torch.randn(10, 10 * 10)
- local sm1 = m1[{4, {}, {}}]
- local sm2 = m2[{4, {}}]
- local res1 = torch.cdiv(sm1, sm2)
- local res2 = res1:clone():zero()
- for i = 1,sm1:size(1) do
- for j = 1, sm1:size(2) do
- local idx1d = (((i-1)*sm1:size(1)))+j
- res2[i][j] = sm1[i][j] / sm2[idx1d]
- end
- end
- local err = res1:clone():zero()
- -- find absolute error
- for i = 1, res1:size(1) do
- for j = 1, res1:size(2) do
- err[i][j] = math.abs(res1[i][j] - res2[i][j])
- end
- end
- -- find maximum element of error
- local maxerr = 0
- for i = 1, err:size(1) do
- for j = 1, err:size(2) do
- if err[i][j] > maxerr then
- maxerr = err[i][j]
- end
- end
- end
- mytester:assertlt(maxerr, precision, 'error in torch.cdiv - contiguous')
-
- -- non-contiguous
- local m1 = torch.randn(10, 10, 10)
- local m2 = torch.randn(10 * 10, 10 * 10)
- local sm1 = m1[{{}, 4, {}}]
- local sm2 = m2[{{}, 4}]
- local res1 = torch.cdiv(sm1, sm2)
- local res2 = res1:clone():zero()
- for i = 1,sm1:size(1) do
- for j = 1, sm1:size(2) do
- local idx1d = (((i-1)*sm1:size(1)))+j
- res2[i][j] = sm1[i][j] / sm2[idx1d]
- end
- end
- local err = res1:clone():zero()
- -- find absolute error
- for i = 1, res1:size(1) do
- for j = 1, res1:size(2) do
- err[i][j] = math.abs(res1[i][j] - res2[i][j])
- end
- end
- -- find maximum element of error
- local maxerr = 0
- for i = 1, err:size(1) do
- for j = 1, err:size(2) do
- if err[i][j] > maxerr then
- maxerr = err[i][j]
- end
- end
+function torchtest.cdiv()
+ local types = {
+ 'torch.ByteTensor',
+ 'torch.CharTensor',
+ 'torch.ShortTensor',
+ 'torch.IntTensor',
+ 'torch.FloatTensor',
+ 'torch.DoubleTensor',
+ 'torch.LongTensor',
+ }
+
+ for k,t in ipairs(types) do
+
+ -- [res] torch.cdiv([res,] tensor1, tensor2)
+ -- contiguous
+ local m1 = torch.randn(10, 10, 10):type(t)
+ local m2 = torch.randn(10, 10 * 10):type(t)
+ m2[m2:eq(0)] = 2
+ local sm1 = m1[{4, {}, {}}]
+ local sm2 = m2[{4, {}}]
+ local res1 = torch.cdiv(sm1, sm2)
+ local res2 = res1:clone():zero()
+ for i = 1,sm1:size(1) do
+ for j = 1, sm1:size(2) do
+ local idx1d = (((i-1)*sm1:size(1)))+j
+ res2[i][j] = sm1[i][j] / sm2[idx1d]
+ end
+ end
+ local err = res1:clone():zero()
+ -- find absolute error
+ for i = 1, res1:size(1) do
+ for j = 1, res1:size(2) do
+ err[i][j] = math.abs(res1[i][j] - res2[i][j])
+ end
+ end
+ -- find maximum element of error
+ local maxerr = 0
+ for i = 1, err:size(1) do
+ for j = 1, err:size(2) do
+ if err[i][j] > maxerr then
+ maxerr = err[i][j]
+ end
+ end
+ end
+ mytester:assertlt(maxerr, precision, 'error in torch.cdiv - contiguous' .. ' ' .. t)
+
+ -- non-contiguous
+ local m1 = torch.randn(10, 10, 10):type(t)
+ local m2 = torch.randn(10 * 10, 10 * 10):type(t)
+ m2[m2:eq(0)] = 2
+ local sm1 = m1[{{}, 4, {}}]
+ local sm2 = m2[{{}, 4}]
+ local res1 = torch.cdiv(sm1, sm2)
+ local res2 = res1:clone():zero()
+ for i = 1,sm1:size(1) do
+ for j = 1, sm1:size(2) do
+ local idx1d = (((i-1)*sm1:size(1)))+j
+ res2[i][j] = sm1[i][j] / sm2[idx1d]
+ end
+ end
+ local err = res1:clone():zero()
+ -- find absolute error
+ for i = 1, res1:size(1) do
+ for j = 1, res1:size(2) do
+ err[i][j] = math.abs(res1[i][j] - res2[i][j])
+ end
+ end
+ -- find maximum element of error
+ local maxerr = 0
+ for i = 1, err:size(1) do
+ for j = 1, err:size(2) do
+ if err[i][j] > maxerr then
+ maxerr = err[i][j]
+ end
+ end
+ end
+ mytester:assertlt(maxerr, precision, 'error in torch.cdiv - non-contiguous' .. ' ' .. t)
end
- mytester:assertlt(maxerr, precision, 'error in torch.cdiv - non-contiguous')
end
function torchtest.cfmod()
@@ -1208,68 +1466,82 @@ function torchtest.cremainder()
mytester:assertlt(maxerr, precision, 'error in torch.cremainder - non-contiguous')
end
-function torchtest.cmul() -- [res] torch.cmul([res,] tensor1, tensor2)
- -- contiguous
- local m1 = torch.randn(10, 10, 10)
- local m2 = torch.randn(10, 10 * 10)
- local sm1 = m1[{4, {}, {}}]
- local sm2 = m2[{4, {}}]
- local res1 = torch.cmul(sm1, sm2)
- local res2 = res1:clone():zero()
- for i = 1,sm1:size(1) do
- for j = 1, sm1:size(2) do
- local idx1d = (((i-1)*sm1:size(1)))+j
- res2[i][j] = sm1[i][j] * sm2[idx1d]
- end
- end
- local err = res1:clone():zero()
- -- find absolute error
- for i = 1, res1:size(1) do
- for j = 1, res1:size(2) do
- err[i][j] = math.abs(res1[i][j] - res2[i][j])
- end
- end
- -- find maximum element of error
- local maxerr = 0
- for i = 1, err:size(1) do
- for j = 1, err:size(2) do
- if err[i][j] > maxerr then
- maxerr = err[i][j]
- end
- end
- end
- mytester:assertlt(maxerr, precision, 'error in torch.cmul - contiguous')
-
- -- non-contiguous
- local m1 = torch.randn(10, 10, 10)
- local m2 = torch.randn(10 * 10, 10 * 10)
- local sm1 = m1[{{}, 4, {}}]
- local sm2 = m2[{{}, 4}]
- local res1 = torch.cmul(sm1, sm2)
- local res2 = res1:clone():zero()
- for i = 1,sm1:size(1) do
- for j = 1, sm1:size(2) do
- local idx1d = (((i-1)*sm1:size(1)))+j
- res2[i][j] = sm1[i][j] * sm2[idx1d]
- end
- end
- local err = res1:clone():zero()
- -- find absolute error
- for i = 1, res1:size(1) do
- for j = 1, res1:size(2) do
- err[i][j] = math.abs(res1[i][j] - res2[i][j])
- end
- end
- -- find maximum element of error
- local maxerr = 0
- for i = 1, err:size(1) do
- for j = 1, err:size(2) do
- if err[i][j] > maxerr then
- maxerr = err[i][j]
- end
- end
- end
- mytester:assertlt(maxerr, precision, 'error in torch.cmul - non-contiguous')
+function torchtest.cmul()
+ local types = {
+ 'torch.ByteTensor',
+ 'torch.CharTensor',
+ 'torch.ShortTensor',
+ 'torch.IntTensor',
+ 'torch.FloatTensor',
+ 'torch.DoubleTensor',
+ 'torch.LongTensor',
+ }
+
+ for k,t in ipairs(types) do
+
+ -- [res] torch.cmul([res,] tensor1, tensor2)
+ -- contiguous
+ local m1 = torch.randn(10, 10, 10):type(t)
+ local m2 = torch.randn(10, 10 * 10):type(t)
+ local sm1 = m1[{4, {}, {}}]
+ local sm2 = m2[{4, {}}]
+ local res1 = torch.cmul(sm1, sm2)
+ local res2 = res1:clone():zero()
+ for i = 1,sm1:size(1) do
+ for j = 1, sm1:size(2) do
+ local idx1d = (((i-1)*sm1:size(1)))+j
+ res2[i][j] = sm1[i][j] * sm2[idx1d]
+ end
+ end
+ local err = res1:clone():zero()
+ -- find absolute error
+ for i = 1, res1:size(1) do
+ for j = 1, res1:size(2) do
+ err[i][j] = math.abs(res1[i][j] - res2[i][j])
+ end
+ end
+ -- find maximum element of error
+ local maxerr = 0
+ for i = 1, err:size(1) do
+ for j = 1, err:size(2) do
+ if err[i][j] > maxerr then
+ maxerr = err[i][j]
+ end
+ end
+ end
+ mytester:assertlt(maxerr, precision, 'error in torch.cmul - contiguous' .. ' ' .. t)
+
+ -- non-contiguous
+ local m1 = torch.randn(10, 10, 10):type(t)
+ local m2 = torch.randn(10 * 10, 10 * 10):type(t)
+ local sm1 = m1[{{}, 4, {}}]
+ local sm2 = m2[{{}, 4}]
+ local res1 = torch.cmul(sm1, sm2)
+ local res2 = res1:clone():zero()
+ for i = 1,sm1:size(1) do
+ for j = 1, sm1:size(2) do
+ local idx1d = (((i-1)*sm1:size(1)))+j
+ res2[i][j] = sm1[i][j] * sm2[idx1d]
+ end
+ end
+ local err = res1:clone():zero()
+ -- find absolute error
+ for i = 1, res1:size(1) do
+ for j = 1, res1:size(2) do
+ err[i][j] = math.abs(res1[i][j] - res2[i][j])
+ end
+ end
+ -- find maximum element of error
+ local maxerr = 0
+ for i = 1, err:size(1) do
+ for j = 1, err:size(2) do
+ if err[i][j] > maxerr then
+ maxerr = err[i][j]
+ end
+ end
+ end
+ mytester:assertlt(maxerr, precision, 'error in torch.cmul - non-contiguous' .. ' ' .. t)
+ end
end
function torchtest.cpow() -- [res] torch.cpow([res,] tensor1, tensor2)
@@ -1342,6 +1614,16 @@ function torchtest.sum()
local mxx = torch.Tensor()
torch.sum(mxx,x,2)
mytester:asserteq(maxdiff(mx,mxx),0,'torch.sum value')
+
+ local y = torch.rand(5, 5, 5)
+ for i=1,3 do
+ local a = y:sum(i)
+ local b = y:narrow(i, 1, 1):clone():zero()
+ for j = 1, 5 do
+ b:add(y:narrow(i, j, 1))
+ end
+ mytester:asserteq(maxdiff(a, b), 0, 'torch.sum value')
+ end
end
function torchtest.prod()
local x = torch.rand(msize,msize)
@@ -1349,6 +1631,16 @@ function torchtest.prod()
local mxx = torch.Tensor()
torch.prod(mxx,x,2)
mytester:asserteq(maxdiff(mx,mxx),0,'torch.prod value')
+
+ local y = torch.rand(5, 5, 5)
+ for i=1,3 do
+ local a = y:prod(i)
+ local b = y:narrow(i, 1, 1):clone():fill(1)
+ for j = 1, 5 do
+ b:cmul(y:narrow(i, j, 1))
+ end
+ mytester:asserteq(maxdiff(a, b), 0, 'torch.prod value')
+ end
end
function torchtest.cumsum()
local x = torch.rand(msize,msize)
@@ -1942,6 +2234,29 @@ function torchtest.catArray()
local mx = torch.cat({x,y})
mytester:asserteq(mx:dim(),0,'torch.cat dim')
end
+function torchtest.catNoDim()
+ local a
+ local b
+ local c
+
+ a = torch.Tensor(msize):uniform()
+ b = torch.Tensor(msize):uniform()
+ c = torch.cat(a, b)
+ mytester:assertTensorEq(c:narrow(1, 1, msize), a, 0, 'torch.cat value')
+ mytester:assertTensorEq(c:narrow(1, msize + 1, msize), b, 0, 'torch.cat value')
+
+ a = torch.Tensor(1, msize):uniform()
+ b = torch.Tensor(1, msize):uniform()
+ c = torch.cat(a, b)
+ mytester:assertTensorEq(c:narrow(2, 1, msize), a, 0, 'torch.cat value')
+ mytester:assertTensorEq(c:narrow(2, msize + 1, msize), b, 0, 'torch.cat value')
+
+ a = torch.Tensor(10, msize):uniform()
+ b = torch.Tensor(10, msize):uniform()
+ c = torch.cat(a, b)
+ mytester:assertTensorEq(c:narrow(2, 1, msize), a, 0, 'torch.cat value')
+ mytester:assertTensorEq(c:narrow(2, msize + 1, msize), b, 0, 'torch.cat value')
+end
function torchtest.sin_2()
local x = torch.rand(msize,msize,msize)
local mx = torch.sin(x)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git