[lua-torch-torch7] 01/11: New upstream version 0~20160908-ge5ebac6
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Sat Sep 10 04:47:33 UTC 2016
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.
commit d3f95c66472eb3fa63f2ffd166d41f87b50a81b4
Author: Zhou Mo <cdluminate at gmail.com>
Date: Sat Sep 10 03:25:40 2016 +0000
New upstream version 0~20160908-ge5ebac6
---
File.lua | 4 +-
ROADMAP.md | 2 +-
Tensor.lua | 2 +-
Tester.lua | 2 +-
doc/maths.md | 43 +-
doc/random.md | 10 +-
generic/Tensor.c | 1 +
lib/TH/CMakeLists.txt | 56 ++-
lib/TH/THAllocator.c | 305 ++++++++++--
lib/TH/THAllocator.h | 18 +-
lib/TH/THAtomic.h | 7 +
lib/TH/THDiskFile.c | 2 +-
lib/TH/THTensor.c | 1 +
lib/TH/THVector.c | 17 +
lib/TH/THVector.h | 571 +----------------------
lib/TH/generic/THStorage.c | 24 +-
lib/TH/generic/THStorage.h | 3 +-
lib/TH/generic/THTensorMath.c | 16 +-
lib/TH/generic/THTensorRandom.c | 2 +-
lib/TH/generic/THVector.h | 14 +
lib/TH/generic/{THVector.c => THVectorDefault.c} | 12 +-
lib/TH/generic/THVectorDispatch.c | 140 ++++++
lib/TH/generic/simd/simd.h | 91 ++++
lib/TH/vector/NEON.c | 252 ++++++++++
lib/TH/vector/SSE.c | 213 +++++++++
lib/luaT/CMakeLists.txt | 4 +
lib/luaT/README.md | 2 +-
test/test.lua | 9 +-
28 files changed, 1132 insertions(+), 691 deletions(-)
diff --git a/File.lua b/File.lua
index 1cc4dfe..62249a3 100644
--- a/File.lua
+++ b/File.lua
@@ -275,7 +275,7 @@ function File:readObject()
local dumped = self:readChar(size):string()
local func, err = loadstring(dumped)
if not func then
- error(string.format('Failed to load function from bytecode: %s', err))
+ io.stderr:write(string.format('Warning: Failed to load function from bytecode: %s', err))
end
local upvalues = self:readObject()
for index,upvalue in ipairs(upvalues) do
@@ -298,7 +298,7 @@ function File:readObject()
local dumped = self:readChar(size):string()
local func, err = loadstring(dumped)
if not func then
- error(string.format('Failed to load function from bytecode: %s', err))
+ io.stderr:write(string.format('Warning: Failed to load function from bytecode: %s', err))
end
if not force then
objects[index] = func
diff --git a/ROADMAP.md b/ROADMAP.md
index cb9c5ad..d906126 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -84,7 +84,7 @@ The roadmap focuses on five separate things
Also, I agree, I actually could not install iTorch on my laptop
before cvpr tutorial somehow, it did not want to work :).
- **soumith**: I think we should propose a common display API that any interface can implement,
- that way the users dont need to change scripts across different UI backends.
+ that way the users don't need to change scripts across different UI backends.
Also, szym/display is a good candidate for the Web UI, ITorch is indeed a bit of a pain to install.
- Should we endorse iTorch for everyone to use?
diff --git a/Tensor.lua b/Tensor.lua
index 0d573aa..b4b3e95 100644
--- a/Tensor.lua
+++ b/Tensor.lua
@@ -389,7 +389,7 @@ torch.repeatTensor = Tensor.repeatTensor
--- One of the size elements can be -1,
--- a new LongStorage is then returned.
--- The length of the unspecified dimension
- --- is infered from the number of remaining elements.
+ --- is inferred from the number of remaining elements.
local function specifyFully(size, nElements)
local nCoveredElements = 1
local remainingDim = nil
diff --git a/Tester.lua b/Tester.lua
index a3b3ff3..f512edb 100644
--- a/Tester.lua
+++ b/Tester.lua
@@ -236,7 +236,7 @@ function Tester:_assertTensorEqOrNeq(ta, tb, negate, ...)
if self._assertTensorEqIgnoresDims and (not negate) and success
and not ta:isSameSizeAs(tb) then
self:_warning("Tensors have the same content but different dimensions. "
- .. "For backwards compatability, they are considered equal, "
+ .. "For backwards compatibility, they are considered equal, "
.. "but this may change in the future. Consider using :eq "
.. "to check for equality instead.")
end
diff --git a/doc/maths.md b/doc/maths.md
index fa322e0..dd427ea 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -978,20 +978,17 @@ The number of elements must match: both `Tensor`s are seen as a 1D vector.
<a name="torch.addmv"></a>
-### [res] torch.addmv([res,] [beta,] [v1,] vec1, [v2,] mat, vec2) ###
+### [res] torch.addmv([res,] [v1,] vec1, [v2,] mat, vec2) ###
<a name="torch.addmv"></a>
Performs a matrix-vector multiplication between `mat` (2D `Tensor`) and `vec2` (1D `Tensor`) and add it to `vec1`.
Optional values `v1` and `v2` are scalars that multiply `vec1` and `vec2` respectively.
-Optional value `beta` is a scalar that scales the result `Tensor`, before accumulating the result into the `Tensor`.
-Defaults to `1.0`.
-
In other words,
```
-res = (beta * res) + (v1 * vec1) + (v2 * (mat * vec2))
+res = (v1 * vec1) + (v2 * (mat * vec2))
```
Sizes must respect the matrix-multiplication operation: if `mat` is a `n × m` matrix, `vec2` must be vector of size `m` and `vec1` must be a vector of size `n`.
@@ -1012,12 +1009,21 @@ Sizes must respect the matrix-multiplication operation: if `mat` is a `n × m` m
`torch.addmv(r, x, y, z)` puts the result in `r`.
-`x:addmv(y, z)` accumulates `y * z` into `x`.
+**Differences when used as a method**
+
+`x:addmv(y, z)` does `x = x + y * z`
+
+`r:addmv(x, y, z)` does `r = x + y * z` if x is a vector
-`r:addmv(x, y, z)` puts the result of `x + y * z` into `r` if `x` is a vector.
+`r:addmv(s, y, z)` does `r = r + s * y * z` if `s` is a scalar.
-`r:addmv(s, y, z)` puts the result of `s * r + y * z` into `r` if `s` is a scalar.
+`r:addmv(x, s, y, z)` does `r = x + s * y * z` if `s` is a scalar and `x` is a vector.
+`r:addmv(s1, s2, y, z)` does `r = s1 * r + s2 * y * z` if `s1` and `s2` are scalars.
+
+The last example does not accurately fit into the function signature, and needs a special mention. It changes the function signature to:
+
+`[vec1] = vec1:addmv([v1,] [v2,] mat, vec2)`
<a name="torch.addr"></a>
### [res] torch.addr([res,] [v1,] mat, [v2,] vec1, vec2) ###
@@ -1075,20 +1081,17 @@ If `vec1` is a vector of size `n` and `vec2` is a vector of size `m`, then `mat`
<a name="torch.addmm"></a>
-### [res] torch.addmm([res,] [beta,] [v1,] M, [v2,] mat1, mat2) ###
+### [res] torch.addmm([res,] [v1,] M, [v2,] mat1, mat2) ###
<a name="torch.addmm"></a>
Performs a matrix-matrix multiplication between `mat1` (2D `Tensor`) and `mat2` (2D `Tensor`).
Optional values `v1` and `v2` are scalars that multiply `M` and `mat1 * mat2` respectively.
-Optional value `beta` is a scalar that scales the result `Tensor`, before accumulating the result into the `Tensor`.
-Defaults to `1.0`.
-
In other words,
```
-res = (res * beta) + (v1 * M) + (v2 * mat1 * mat2)
+res = (v1 * M) + (v2 * mat1 * mat2)
```
If `mat1` is a `n × m` matrix, `mat2` a `m × p` matrix, `M` must be a `n × p` matrix.
@@ -1097,9 +1100,19 @@ If `mat1` is a `n × m` matrix, `mat2` a `m × p` matrix, `M` must be a `n × p`
`torch.addmm(r, M, mat1, mat2)` puts the result in `r`.
-`M:addmm(mat1, mat2)` puts the result in `M`.
+**Differences when used as a method**
+
+`M:addmm(mat1, mat2)` does `M = M + mat1 * mat2`.
+
+`r:addmm(M, mat1, mat2)` does `r = M + mat1 * mat2`.
+
+`r:addmm(v1, M, v2, mat1, mat2)` does `r = (v1 * M) + (v2 * mat1 * mat2)`.
+
+`M:addmm(v1, v2, mat1, mat2)` does `M = (v1 * M) + (v2 * mat1 * mat2)`.
+
+The last example does not accurately fit into the function signature, and needs a special mention. It changes the function signature to:
-`r:addmm(M, mat1, mat2)` puts the result in `r`.
+`[M] = M:addmm([v1,] [v2,] mat1, mat2)`
<a name="torch.addbmm"></a>
diff --git a/doc/random.md b/doc/random.md
index 7097edb..e6fa6ab 100644
--- a/doc/random.md
+++ b/doc/random.md
@@ -128,12 +128,12 @@ returns its argument, `state`.
<a name="torch.random"></a>
### [number] random([gen,] [a], [b]) ###
-Returns an unsigned 32 bit integer random number from [a,b]. By default `a` is 1 and `b` is 2^32.
+Returns an unsigned 32 bit integer random number from `[a,b]`. By default `a` is `1` and `b` is `2^32`.
<a name="torch.uniform"></a>
### [number] uniform([gen,] [a],[b]) ###
-Returns a random real number according to uniform distribution on [a,b). By default `a` is 0 and `b` is 1.
+Returns a random real number according to uniform distribution on `[a,b)`. By default `a` is `0` and `b` is `1`.
<a name="torch.normal"></a>
### [number] normal([gen,] [mean],[stdv]) ###
@@ -145,13 +145,13 @@ Returns a random real number according to a normal distribution with the given `
### [number] exponential([gen,] lambda) ###
Returns a random real number according to the exponential distribution
-''p(x) = lambda * exp(-lambda * x)''
+`p(x) = lambda * exp(-lambda * x)`
<a name="torch.cauchy"></a>
### [number] cauchy([gen,] median, sigma) ###
Returns a random real number according to the Cauchy distribution
-''p(x) = sigma/(pi*(sigma^2 + (x-median)^2))''
+`p(x) = sigma/(pi*(sigma^2 + (x-median)^2))`
<a name="torch.logNormal"></a>
### [number] logNormal([gen,] mean, stdv) ###
@@ -164,7 +164,7 @@ the given `mean` and standard deviation `stdv`.
### [number] geometric([gen,] p) ###
Returns a random integer number according to a geometric distribution
-''p(i) = (1-p) * p^(i-1)`. `p` must satisfy `0 < p < 1''.
+`p(i) = (1-p) * p^(i-1)`. `p` must satisfy `0 < p < 1`.
<a name="torch.bernoulli"></a>
### [number] bernoulli([gen,] [p]) ###
diff --git a/generic/Tensor.c b/generic/Tensor.c
index 0bf74e1..3067213 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -1318,6 +1318,7 @@ void torch_Tensor_(init)(lua_State *L)
torch_Tensor_(new), torch_Tensor_(free), torch_Tensor_(factory));
luaT_setfuncs(L, torch_Tensor_(_), 0);
lua_pop(L, 1);
+ THVector_(vectorDispatchInit)();
}
#endif
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 551ea50..e1610af 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -70,28 +70,6 @@ IF (CORTEXA9_FOUND)
SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}")
ENDIF (CORTEXA9_FOUND)
-IF(UNIX)
- INCLUDE(CheckFunctionExists)
- SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
- CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
- IF(HAVE_MMAP)
- ADD_DEFINITIONS(-DHAVE_MMAP=1)
- ENDIF(HAVE_MMAP)
- ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
- CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
- IF(HAVE_SHM_OPEN)
- ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1)
- ENDIF(HAVE_SHM_OPEN)
- CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK)
- IF(HAVE_SHM_UNLINK)
- ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1)
- ENDIF(HAVE_SHM_UNLINK)
- CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
- IF(HAVE_MALLOC_USABLE_SIZE)
- ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1)
- ENDIF(HAVE_MALLOC_USABLE_SIZE)
-ENDIF(UNIX)
-
FIND_PACKAGE(SSE)
IF(C_SSE2_FOUND)
SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
@@ -129,7 +107,7 @@ SET(hdr
SET(src
THGeneral.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
- THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c)
+ THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
SET(src ${src} ${hdr} ${simd})
ADD_LIBRARY(TH SHARED ${src})
@@ -137,6 +115,10 @@ if(BUILD_STATIC)
ADD_LIBRARY(TH_static STATIC ${src})
endif()
+SET_TARGET_PROPERTIES(TH PROPERTIES
+ VERSION 0
+ SOVERSION 0)
+
CHECK_C_SOURCE_RUNS("
#include <stdatomic.h>
int main()
@@ -220,9 +202,34 @@ IF (UNIX AND NOT APPLE)
CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" NEED_LIBRT)
IF(NEED_LIBRT)
TARGET_LINK_LIBRARIES(TH rt)
+ SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} rt)
ENDIF(NEED_LIBRT)
ENDIF(UNIX AND NOT APPLE)
+IF(UNIX)
+ INCLUDE(CheckFunctionExists)
+ SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
+ CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
+ IF(HAVE_MMAP)
+ ADD_DEFINITIONS(-DHAVE_MMAP=1)
+ ENDIF(HAVE_MMAP)
+ ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
+ CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
+ IF(HAVE_SHM_OPEN)
+ ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1)
+ ENDIF(HAVE_SHM_OPEN)
+ CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK)
+ IF(HAVE_SHM_UNLINK)
+ ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1)
+ ENDIF(HAVE_SHM_UNLINK)
+ CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
+ IF(HAVE_MALLOC_USABLE_SIZE)
+ ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1)
+ ENDIF(HAVE_MALLOC_USABLE_SIZE)
+ENDIF(UNIX)
+
+
+
IF(NOT MSVC)
TARGET_LINK_LIBRARIES(TH m)
ENDIF(NOT MSVC)
@@ -327,7 +334,8 @@ INSTALL(FILES
generic/THTensorMath.h
generic/THTensorRandom.c
generic/THTensorRandom.h
- generic/THVector.c
+ generic/THVectorDispatch.c
+ generic/THVector.h
DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/generic")
diff --git a/lib/TH/THAllocator.c b/lib/TH/THAllocator.c
index 6992544..d64b752 100644
--- a/lib/TH/THAllocator.c
+++ b/lib/TH/THAllocator.c
@@ -1,4 +1,5 @@
#include "THAllocator.h"
+#include "THAtomic.h"
/* stuff for mapped files */
#ifdef _WIN32
@@ -36,22 +37,60 @@ THAllocator THDefaultAllocator = {
struct THMapAllocatorContext_ {
char *filename; /* file name */
- int shared; /* is shared or not */
+ int flags;
long size; /* mapped size */
+ int fd;
};
-THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared)
+#define TH_ALLOC_ALIGNMENT 64
+
+typedef struct {
+ int refcount;
+} THMapInfo;
+
+char * unknown_filename = "filename not specified";
+
+THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags)
{
THMapAllocatorContext *ctx = THAlloc(sizeof(THMapAllocatorContext));
- ctx->filename = THAlloc(strlen(filename)+1);
- strcpy(ctx->filename, filename);
- ctx->shared = shared;
+ if (!(flags & TH_ALLOCATOR_MAPPED_SHARED) && !(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM))
+ flags &= ~TH_ALLOCATOR_MAPPED_NOCREATE;
+ if ((flags ^ TH_ALLOCATOR_MAPPED_EXCLUSIVE) == 0)
+ THError("TH_ALLOCATOR_MAPPED_EXCLUSIVE flag requires opening the file "
+ "in shared mode");
+
+ if (filename) {
+ ctx->filename = THAlloc(strlen(filename)+1);
+ strcpy(ctx->filename, filename);
+ } else {
+ ctx->filename = unknown_filename;
+ }
+ ctx->flags = flags;
ctx->size = 0;
+ ctx->fd = -1;
+
+ return ctx;
+}
+
+THMapAllocatorContext *THMapAllocatorContext_newWithFd(const char *filename, int fd, int flags)
+{
+ THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, flags);
+ ctx->fd = fd;
return ctx;
}
+char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx)
+{
+ return ctx->filename;
+}
+
+int THMapAllocatorContext_fd(THMapAllocatorContext *ctx)
+{
+ return ctx->fd;
+}
+
long THMapAllocatorContext_size(THMapAllocatorContext *ctx)
{
return ctx->size;
@@ -59,11 +98,12 @@ long THMapAllocatorContext_size(THMapAllocatorContext *ctx)
void THMapAllocatorContext_free(THMapAllocatorContext *ctx)
{
- THFree(ctx->filename);
+ if (ctx->filename != unknown_filename)
+ THFree(ctx->filename);
THFree(ctx);
}
-static void *THMapAllocator_alloc(void* ctx_, long size)
+static void *_map_alloc(void* ctx_, long size)
{
THMapAllocatorContext *ctx = ctx_;
void *data = NULL;
@@ -75,9 +115,18 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
DWORD size_hi, size_lo;
size_t hfilesz;
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE)
+ THError("exclusive file mapping is not supported on Windows");
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_NOCREATE)
+ THError("file mapping without creation is not supported on Windows");
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD)
+ THError("TH_ALLOCATOR_MAPPED_KEEPFD not supported on Windows");
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)
+ THError("TH_ALLOCATOR_MAPPED_FROMFD not supported on Windows");
+
/* open file */
/* FILE_FLAG_RANDOM_ACCESS ? */
- if(ctx->shared)
+ if(ctx->flags)
{
hfile = CreateFileA(ctx->filename, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
if (hfile == INVALID_HANDLE_VALUE)
@@ -103,7 +152,7 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
{
if(size > hfilesz)
{
- if(ctx->shared)
+ if(ctx->flags)
{
#if SIZEOF_SIZE_T > 4
size_hi = (DWORD)((size) >> 32);
@@ -144,7 +193,7 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
#endif
/* get map handle */
- if(ctx->shared)
+ if(ctx->flags)
{
if( (hmfile = CreateFileMapping(hfile, NULL, PAGE_READWRITE, size_hi, size_lo, NULL)) == NULL )
THError("could not create a map on file <%s>", ctx->filename);
@@ -156,66 +205,89 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
}
/* map the stuff */
- if(ctx->shared)
+ if(ctx->flags)
data = MapViewOfFile(hmfile, FILE_MAP_ALL_ACCESS, 0, 0, 0);
else
data = MapViewOfFile(hmfile, FILE_MAP_COPY, 0, 0, 0);
- CloseHandle(hfile);
- CloseHandle(hmfile);
+ CloseHandle(hfile);
+ CloseHandle(hmfile);
}
#else /* _WIN32 */
{
/* open file */
int fd;
- long fdsz;
+ int flags;
+ struct stat file_stat;
- if(ctx->shared == TH_ALLOCATOR_MAPPED_SHARED)
- {
- if((fd = open(ctx->filename, O_RDWR | O_CREAT, (mode_t)0600)) == -1)
- THError("unable to open file <%s> in read-write mode", ctx->filename);
- }
- else if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
- {
+ if (ctx->flags & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM))
+ flags = O_RDWR | O_CREAT;
+ else
+ flags = O_RDONLY;
+
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE)
+ flags |= O_EXCL;
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_NOCREATE)
+ flags &= ~O_CREAT;
+
+ if (!(ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)) {
+ if(ctx->flags & TH_ALLOCATOR_MAPPED_SHARED)
+ {
+ if((fd = open(ctx->filename, flags, (mode_t)0600)) == -1)
+ THError("unable to open file <%s> in read-write mode", ctx->filename);
+ }
+ else if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
+ {
#ifdef HAVE_SHM_OPEN
- if((fd = shm_open(ctx->filename, O_RDWR | O_CREAT, (mode_t)0600)) == -1)
- THError("unable to open file <%s> in read-write mode", ctx->filename);
+ if((fd = shm_open(ctx->filename, flags, (mode_t)0600)) == -1)
+ THError("unable to open shared memory object <%s> in read-write mode", ctx->filename);
#else
- THError("unable to open file <%s> in sharedmem mode, shm_open unavailable on this platform");
+ THError("unable to open file <%s> in sharedmem mode, shm_open unavailable on this platform", ctx->filename);
#endif
+ }
+ else
+ {
+ if((fd = open(ctx->filename, O_RDONLY)) == -1)
+ THError("unable to open file <%s> in read-only mode", ctx->filename);
+ }
+ } else {
+ fd = ctx->fd;
}
- else
- {
- if((fd = open(ctx->filename, O_RDONLY)) == -1)
- THError("unable to open file <%s> in read-only mode", ctx->filename);
- }
- if((fdsz = lseek(fd, 0, SEEK_END)) == -1)
+
+ if(fstat(fd, &file_stat) == -1)
{
- close(fd);
- THError("unable to seek at end of file <%s>", ctx->filename);
+ if (!(ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD))
+ close(fd);
+ THError("unable to stat the file <%s>", ctx->filename);
}
+
if(size > 0)
{
- if(size > fdsz)
+ if(size > file_stat.st_size)
{
- if(ctx->shared)
+ if(ctx->flags)
{
/* if it is shared mem, let's put it in correct size */
- if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
{
if(ftruncate(fd, size) == -1)
THError("unable to resize shared memory file <%s> to the right size", ctx->filename);
}
- if((fdsz = lseek(fd, size-1, SEEK_SET)) == -1)
+ if(fstat(fd, &file_stat) == -1 || file_stat.st_size < size)
{
close(fd);
THError("unable to stretch file <%s> to the right size", ctx->filename);
}
+/* on OS X write returns with errno 45 (Opperation not supported) when used
+ * with a file descriptor obtained via shm_open
+ */
+#ifndef __APPLE__
if((write(fd, "", 1)) != 1) /* note that the string "" contains the '\0' byte ... */
{
close(fd);
THError("unable to write to file <%s>", ctx->filename);
}
+#endif
}
else
{
@@ -225,18 +297,40 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
}
}
else
- size = fdsz;
+ size = file_stat.st_size;
ctx->size = size; /* if we are here, it must be the right size */
-
+
/* map it */
- if(ctx->shared)
+ if (ctx->flags & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM))
data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
else
data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
- if(close(fd) == -1)
- THError("Error closing file <%s>", ctx->filename);
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) {
+ ctx->fd = fd;
+ } else {
+ if(close(fd) == -1)
+ THError("Error closing file <%s>", ctx->filename);
+ ctx->fd = -1;
+ }
+
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_UNLINK) {
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
+ {
+#ifdef HAVE_SHM_UNLINK
+ if (shm_unlink(ctx->filename) == -1)
+ THError("could not unlink the shared memory file %s", ctx->filename);
+#else
+ THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
+#endif
+ }
+ else
+ {
+ if (unlink(ctx->filename) == -1)
+ THError("could not unlink file %s", ctx->filename);
+ }
+ }
if(data == MAP_FAILED)
{
@@ -249,6 +343,10 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
return data;
}
+static void * THMapAllocator_alloc(void *ctx, long size) {
+ return _map_alloc(ctx, size);
+}
+
static void *THMapAllocator_realloc(void* ctx, void* ptr, long size) {
THError("cannot realloc mapped data");
return NULL;
@@ -260,26 +358,35 @@ static void THMapAllocator_free(void* ctx_, void* data) {
#ifdef _WIN32
if(!UnmapViewOfFile((LPINT)data))
THError("could not unmap the shared memory file");
-#else
+#else /* _WIN32 */
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) {
+ if (close(ctx->fd) == -1)
+ THError("could not close file descriptor %d", ctx->fd);
+ }
+
if (munmap(data, ctx->size))
THError("could not unmap the shared memory file");
- if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
+
+ if (!(ctx->flags & (TH_ALLOCATOR_MAPPED_FROMFD | TH_ALLOCATOR_MAPPED_UNLINK)))
{
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
+ {
#ifdef HAVE_SHM_UNLINK
- if (shm_unlink(ctx->filename) == -1)
- THError("could not unlink the shared memory file %s", ctx->filename);
+ if (shm_unlink(ctx->filename) == -1)
+ THError("could not unlink the shared memory file %s", ctx->filename);
#else
- THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
+ THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
#endif
+ }
}
-#endif
+#endif /* _WIN32 */
THMapAllocatorContext_free(ctx);
}
#else
-THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared) {
+THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags) {
THError("file mapping not supported on your system");
return NULL;
}
@@ -304,8 +411,110 @@ static void THMapAllocator_free(void* ctx, void* data) {
#endif
+#if (defined(_WIN32) || defined(HAVE_MMAP)) && defined(TH_ATOMIC_IPC_REFCOUNT)
+
+static void * THRefcountedMapAllocator_alloc(void *_ctx, long size) {
+ THMapAllocatorContext *ctx = _ctx;
+
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)
+ THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_FROMFD flag");
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD)
+ THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_KEEPFD flag");
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_UNLINK)
+ THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_UNLINK flag");
+ if (!(ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM))
+ THError("THRefcountedMapAllocator requires TH_ALLOCATOR_MAPPED_SHAREDMEM flag");
+
+ size = size + TH_ALLOC_ALIGNMENT;
+ void *ptr = _map_alloc(ctx, size);
+ char *data = ((char*)ptr) + TH_ALLOC_ALIGNMENT;
+ THMapInfo *map_info = (THMapInfo*)ptr;
+
+ if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE)
+ map_info->refcount = 1;
+ else
+ THAtomicIncrementRef(&map_info->refcount);
+
+ return (void*)data;
+}
+
+static void *THRefcountedMapAllocator_realloc(void* ctx, void* ptr, long size) {
+ THError("cannot realloc mapped data");
+ return NULL;
+}
+
+static void THRefcountedMapAllocator_free(void* ctx_, void* data) {
+ THMapAllocatorContext *ctx = ctx_;
+
+#ifdef _WIN32
+ if(!UnmapViewOfFile((LPINT)data))
+ THError("could not unmap the shared memory file");
+#else /* _WIN32 */
+
+ THMapInfo *info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT);
+ if (THAtomicDecrementRef(&info->refcount)) {
+#ifdef HAVE_SHM_UNLINK
+ if (shm_unlink(ctx->filename) == -1)
+ THError("could not unlink the shared memory file %s", ctx->filename);
+#else
+ THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
+#endif /* HAVE_SHM_UNLINK */
+ }
+ if (munmap(info, ctx->size))
+ THError("could not unmap the shared memory file %s", ctx->filename);
+#endif /* _WIN32 */
+
+ THMapAllocatorContext_free(ctx);
+}
+
+void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data)
+{
+ THMapInfo *map_info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT);
+ THAtomicIncrementRef(&map_info->refcount);
+}
+
+int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data)
+{
+ THMapInfo *map_info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT);
+ return THAtomicDecrementRef(&map_info->refcount);
+}
+
+#else
+
+static void * THRefcountedMapAllocator_alloc(void *ctx, long size) {
+ THError("refcounted file mapping not supported on your system");
+ return NULL;
+}
+
+static void *THRefcountedMapAllocator_realloc(void* ctx, void* ptr, long size) {
+ THError("refcounted file mapping not supported on your system");
+ return NULL;
+}
+
+static void THRefcountedMapAllocator_free(void* ctx_, void* data) {
+ THError("refcounted file mapping not supported on your system");
+}
+
+void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data)
+{
+ THError("refcounted file mapping not supported on your system");
+}
+
+int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data)
+{
+ THError("refcounted file mapping not supported on your system");
+}
+
+#endif
+
THAllocator THMapAllocator = {
&THMapAllocator_alloc,
&THMapAllocator_realloc,
&THMapAllocator_free
};
+
+THAllocator THRefcountedMapAllocator = {
+ &THRefcountedMapAllocator_alloc,
+ &THRefcountedMapAllocator_realloc,
+ &THRefcountedMapAllocator_free
+};
diff --git a/lib/TH/THAllocator.h b/lib/TH/THAllocator.h
index dbc75a8..14c433a 100644
--- a/lib/TH/THAllocator.h
+++ b/lib/TH/THAllocator.h
@@ -5,6 +5,11 @@
#define TH_ALLOCATOR_MAPPED_SHARED 1
#define TH_ALLOCATOR_MAPPED_SHAREDMEM 2
+#define TH_ALLOCATOR_MAPPED_EXCLUSIVE 4
+#define TH_ALLOCATOR_MAPPED_NOCREATE 8
+#define TH_ALLOCATOR_MAPPED_KEEPFD 16
+#define TH_ALLOCATOR_MAPPED_FROMFD 32
+#define TH_ALLOCATOR_MAPPED_UNLINK 64
/* Custom allocator
*/
@@ -22,10 +27,17 @@ extern THAllocator THDefaultAllocator;
/* file map allocator
*/
typedef struct THMapAllocatorContext_ THMapAllocatorContext;
-THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared);
-long THMapAllocatorContext_size(THMapAllocatorContext *ctx);
-void THMapAllocatorContext_free(THMapAllocatorContext *ctx);
+TH_API THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags);
+TH_API THMapAllocatorContext *THMapAllocatorContext_newWithFd(const char *filename,
+ int fd, int flags);
+TH_API char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx);
+TH_API int THMapAllocatorContext_fd(THMapAllocatorContext *ctx);
+TH_API long THMapAllocatorContext_size(THMapAllocatorContext *ctx);
+TH_API void THMapAllocatorContext_free(THMapAllocatorContext *ctx);
+TH_API void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data);
+TH_API int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data);
extern THAllocator THMapAllocator;
+extern THAllocator THRefcountedMapAllocator;
#endif
diff --git a/lib/TH/THAtomic.h b/lib/TH/THAtomic.h
index 3a37c31..3a0b6fa 100644
--- a/lib/TH/THAtomic.h
+++ b/lib/TH/THAtomic.h
@@ -86,4 +86,11 @@ TH_API long THAtomicAddLong(long volatile *a, long value);
*/
TH_API long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue);
+#if defined(USE_C11_ATOMICS) && defined(ATOMIC_INT_LOCK_FREE) && \
+ ATOMIC_INT_LOCK_FREE == 2
+#define TH_ATOMIC_IPC_REFCOUNT 1
+#elif defined(USE_MSC_ATOMICS) || defined(USE_GCC_ATOMICS)
+#define TH_ATOMIC_IPC_REFCOUNT 1
+#endif
+
#endif
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index dff9710..7064b7f 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -207,7 +207,7 @@ static size_t THDiskFile_position(THFile *self)
if (offset > -1)
return (size_t)offset;
else if(!dfself->file.isQuiet)
- THError("unable to obtain disk file offset (maybe a long overflow occured)");
+ THError("unable to obtain disk file offset (maybe a long overflow occurred)");
return 0;
}
diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c
index b0ab0a5..2878fc9 100644
--- a/lib/TH/THTensor.c
+++ b/lib/TH/THTensor.c
@@ -1,6 +1,7 @@
#include "THAtomic.h"
#include "THTensor.h"
#include "THVector.h"
+
#include "THBlas.h"
#include "THLapack.h"
#include "THRandom.h"
diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c
new file mode 100644
index 0000000..6179d89
--- /dev/null
+++ b/lib/TH/THVector.c
@@ -0,0 +1,17 @@
+#include "THVector.h"
+#include "generic/simd/simd.h"
+
+#ifdef __NEON__
+#include "vector/NEON.c"
+#endif
+
+#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+ || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+#include "vector/SSE.c"
+#endif
+
+#include "generic/THVectorDefault.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THVectorDispatch.c"
+#include "THGenerateAllTypes.h"
diff --git a/lib/TH/THVector.h b/lib/TH/THVector.h
index 1344e75..e29917b 100644
--- a/lib/TH/THVector.h
+++ b/lib/TH/THVector.h
@@ -5,570 +5,9 @@
#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME)
-#if defined USE_SSE2 || defined USE_SSE3 || defined USE_SSSE3 \
- || defined USE_SSE4_1 || defined USE_SSE4_2
+/* We are going to use dynamic dispatch, and want only to generate declarations
+ * of the vector functions */
+#include "generic/THVector.h"
+#include "THGenerateAllTypes.h"
-#ifdef USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#ifdef USE_SSE3
-#include <pmmintrin.h>
-#endif
-
-#ifdef USE_SSSE3
-#include <tmmintrin.h>
-#endif
-
-#if defined (USE_SSE4_2) || defined (USE_SSE4_1)
-#include <smmintrin.h>
-#endif
-
-#define THDoubleVector_fill(x, c, n) { \
- long i; \
- long off; \
- __m128d XMM0 = _mm_set1_pd(c); \
- for (i=0; i<=((n)-8); i+=8) { \
- _mm_storeu_pd((x)+i , XMM0); \
- _mm_storeu_pd((x)+i+2, XMM0); \
- _mm_storeu_pd((x)+i+4, XMM0); \
- _mm_storeu_pd((x)+i+6, XMM0); \
- } \
- off = (n) - ((n)%8); \
- for (i=0; i<((n)%8); i++) { \
- x[off+i] = c; \
- } \
- }
-
-
-#define THDoubleVector_add(y, x, c, n) { \
- long i = 0; \
- __m128d XMM7 = _mm_set1_pd(c); \
- __m128d XMM0,XMM2; \
- for (; i<=((n)-2); i+=2) { \
- XMM0 = _mm_loadu_pd((x)+i); \
- XMM2 = _mm_loadu_pd((y)+i); \
- XMM0 = _mm_mul_pd(XMM0, XMM7); \
- XMM2 = _mm_add_pd(XMM2, XMM0); \
- _mm_storeu_pd((y)+i , XMM2); \
- } \
- for (; i<(n); i++) { \
- y[i] += c * x[i]; \
- } \
- }
-
-#define THDoubleVector_diff(z, x, y, n) { \
- long i; \
- for (i=0; i<=((n)-8); i+=8) { \
- __m128d XMM0 = _mm_loadu_pd((x)+i ); \
- __m128d XMM1 = _mm_loadu_pd((x)+i+2); \
- __m128d XMM2 = _mm_loadu_pd((x)+i+4); \
- __m128d XMM3 = _mm_loadu_pd((x)+i+6); \
- __m128d XMM4 = _mm_loadu_pd((y)+i ); \
- __m128d XMM5 = _mm_loadu_pd((y)+i+2); \
- __m128d XMM6 = _mm_loadu_pd((y)+i+4); \
- __m128d XMM7 = _mm_loadu_pd((y)+i+6); \
- XMM0 = _mm_sub_pd(XMM0, XMM4); \
- XMM1 = _mm_sub_pd(XMM1, XMM5); \
- XMM2 = _mm_sub_pd(XMM2, XMM6); \
- XMM3 = _mm_sub_pd(XMM3, XMM7); \
- _mm_storeu_pd((z)+i , XMM0); \
- _mm_storeu_pd((z)+i+2, XMM1); \
- _mm_storeu_pd((z)+i+4, XMM2); \
- _mm_storeu_pd((z)+i+6, XMM3); \
- } \
- long off = (n) - ((n)%8); \
- for (i=0; i<((n)%8); i++) { \
- z[off+i] = x[off+i] - y[off+i]; \
- } \
- }
-
-#define THDoubleVector_scale(y, c, n) { \
- long i; \
- __m128d XMM7 = _mm_set1_pd(c); \
- for (i=0; i<=((n)-4); i+=4) { \
- __m128d XMM0 = _mm_loadu_pd((y)+i ); \
- __m128d XMM1 = _mm_loadu_pd((y)+i+2); \
- XMM0 = _mm_mul_pd(XMM0, XMM7); \
- XMM1 = _mm_mul_pd(XMM1, XMM7); \
- _mm_storeu_pd((y)+i , XMM0); \
- _mm_storeu_pd((y)+i+2, XMM1); \
- } \
- long off = (n) - ((n)%4); \
- for (i=0; i<((n)%4); i++) { \
- y[off+i] *= c; \
- } \
- }
-
-#define THDoubleVector_mul(y, x, n) { \
- long i; \
- for (i=0; i<=((n)-8); i+=8) { \
- __m128d XMM0 = _mm_loadu_pd((x)+i ); \
- __m128d XMM1 = _mm_loadu_pd((x)+i+2); \
- __m128d XMM2 = _mm_loadu_pd((x)+i+4); \
- __m128d XMM3 = _mm_loadu_pd((x)+i+6); \
- __m128d XMM4 = _mm_loadu_pd((y)+i ); \
- __m128d XMM5 = _mm_loadu_pd((y)+i+2); \
- __m128d XMM6 = _mm_loadu_pd((y)+i+4); \
- __m128d XMM7 = _mm_loadu_pd((y)+i+6); \
- XMM4 = _mm_mul_pd(XMM4, XMM0); \
- XMM5 = _mm_mul_pd(XMM5, XMM1); \
- XMM6 = _mm_mul_pd(XMM6, XMM2); \
- XMM7 = _mm_mul_pd(XMM7, XMM3); \
- _mm_storeu_pd((y)+i , XMM4); \
- _mm_storeu_pd((y)+i+2, XMM5); \
- _mm_storeu_pd((y)+i+4, XMM6); \
- _mm_storeu_pd((y)+i+6, XMM7); \
- } \
- long off = (n) - ((n)%8); \
- for (i=0; i<((n)%8); i++) { \
- y[off+i] *= x[off+i]; \
- } \
- }
-
-#define THFloatVector_fill(x, c, n) { \
- long i; \
- __m128 XMM0 = _mm_set_ps1(c); \
- long off; \
- for (i=0; i<=((n)-16); i+=16) { \
- _mm_storeu_ps((x)+i , XMM0); \
- _mm_storeu_ps((x)+i+4, XMM0); \
- _mm_storeu_ps((x)+i+8, XMM0); \
- _mm_storeu_ps((x)+i+12, XMM0); \
- } \
- off = (n) - ((n)%16); \
- for (i=0; i<((n)%16); i++) { \
- x[off+i] = c; \
- } \
- }
-
-#define THFloatVector_add(y, x, c, n) { \
- long i = 0; \
- __m128 XMM7 = _mm_set_ps1(c); \
- __m128 XMM0,XMM2; \
- for (; i<=((n)-4); i+=4) { \
- XMM0 = _mm_loadu_ps((x)+i); \
- XMM2 = _mm_loadu_ps((y)+i); \
- XMM0 = _mm_mul_ps(XMM0, XMM7); \
- XMM2 = _mm_add_ps(XMM2, XMM0); \
- _mm_storeu_ps((y)+i , XMM2); \
- } \
- for (; i<(n); i++) { \
- y[i] += c * x[i]; \
- } \
- }
-
-#define THFloatVector_diff(z, x, y, n) { \
- long i; \
- for (i=0; i<=((n)-16); i+=16) { \
- __m128 XMM0 = _mm_loadu_ps((x)+i ); \
- __m128 XMM1 = _mm_loadu_ps((x)+i+ 4); \
- __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); \
- __m128 XMM3 = _mm_loadu_ps((x)+i+12); \
- __m128 XMM4 = _mm_loadu_ps((y)+i ); \
- __m128 XMM5 = _mm_loadu_ps((y)+i+ 4); \
- __m128 XMM6 = _mm_loadu_ps((y)+i+ 8); \
- __m128 XMM7 = _mm_loadu_ps((y)+i+12); \
- XMM0 = _mm_sub_ps(XMM0, XMM4); \
- XMM1 = _mm_sub_ps(XMM1, XMM5); \
- XMM2 = _mm_sub_ps(XMM2, XMM6); \
- XMM3 = _mm_sub_ps(XMM3, XMM7); \
- _mm_storeu_ps((z)+i , XMM0); \
- _mm_storeu_ps((z)+i+ 4, XMM1); \
- _mm_storeu_ps((z)+i+ 8, XMM2); \
- _mm_storeu_ps((z)+i+12, XMM3); \
- } \
- long off = (n) - ((n)%16); \
- for (i=0; i<((n)%16); i++) { \
- z[off+i] = x[off+i] - y[off+i]; \
- } \
- }
-
-#define THFloatVector_scale(y, c, n) { \
- long i; \
- __m128 XMM7 = _mm_set_ps1(c); \
- for (i=0; i<=((n)-8); i+=8) { \
- __m128 XMM0 = _mm_loadu_ps((y)+i ); \
- __m128 XMM1 = _mm_loadu_ps((y)+i+4); \
- XMM0 = _mm_mul_ps(XMM0, XMM7); \
- XMM1 = _mm_mul_ps(XMM1, XMM7); \
- _mm_storeu_ps((y)+i , XMM0); \
- _mm_storeu_ps((y)+i+4, XMM1); \
- } \
- long off = (n) - ((n)%8); \
- for (i=0; i<((n)%8); i++) { \
- y[off+i] *= c; \
- } \
- }
-
-#define THFloatVector_mul(y, x, n) { \
- long i; \
- for (i=0; i<=((n)-16); i+=16) { \
- __m128 XMM0 = _mm_loadu_ps((x)+i ); \
- __m128 XMM1 = _mm_loadu_ps((x)+i+ 4); \
- __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); \
- __m128 XMM3 = _mm_loadu_ps((x)+i+12); \
- __m128 XMM4 = _mm_loadu_ps((y)+i ); \
- __m128 XMM5 = _mm_loadu_ps((y)+i+ 4); \
- __m128 XMM6 = _mm_loadu_ps((y)+i+ 8); \
- __m128 XMM7 = _mm_loadu_ps((y)+i+12); \
- XMM4 = _mm_mul_ps(XMM4, XMM0); \
- XMM5 = _mm_mul_ps(XMM5, XMM1); \
- XMM6 = _mm_mul_ps(XMM6, XMM2); \
- XMM7 = _mm_mul_ps(XMM7, XMM3); \
- _mm_storeu_ps((y)+i , XMM4); \
- _mm_storeu_ps((y)+i+ 4, XMM5); \
- _mm_storeu_ps((y)+i+ 8, XMM6); \
- _mm_storeu_ps((y)+i+12, XMM7); \
- } \
- long off = (n) - ((n)%16); \
- for (i=0; i<((n)%16); i++) { \
- y[off+i] *= x[off+i]; \
- } \
- }
-
-#elif defined __NEON__
-/* ARM NEON Assembly routine for operating on floats */
-
-#define THFloatVector_fill(x, c, n) { \
- float ctemp = c; \
- float * caddr = &ctemp; \
- __asm__ __volatile__ ( \
- "mov r0, %0 @ \n\t" \
- "ldr r4, [%1] @ \n\t" \
- "vdup.32 q12, r4 @ \n\t" \
- "vdup.32 q13, r4 @ \n\t" \
- "lsrs r4, %2, #3 @ \n\t" \
- "beq 3f @ \n\t" \
- "1: @ \n\t" \
- "vst1.32 {d24-d27}, [r0]! @ \n\t" \
- "subs r4, r4, #1 @ \n\t" \
- "bne 1b @ \n\t" \
- "3: @ \n\t" \
- "ands r4, %2, #7 @ \n\t" \
- "beq 5f @ \n\t" \
- "4: @ \n\t" \
- "subs r4, r4, #1 @ \n\t" \
- "vst1.32 {d24[0]}, [r0]! @ \n\t" \
- "bne 4b @ \n\t" \
- "5: @ " \
- : \
- :"r" (x), "r"(caddr),"r"(n) \
- : "cc", "r0", "r4", "memory", \
- "q12", \
- "d24", "d25", "d26", "d27" \
- ); \
- }
-
-#define THFloatVector_diff(z, x, y, n) { \
- __asm__ __volatile__ ( \
- "mov r0, %2 @ \n\t" \
- "mov r1, %1 @ \n\t" \
- "mov r2, %0 @ \n\t" \
- "lsrs r4, %3, #3 @ \n\t" \
- "beq 3f @ \n\t" \
- "vld1.32 {d16-d19}, [r1]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "1: @ \n\t" \
- "vsub.f32 q12, q8, q0 @ \n\t" \
- "vsub.f32 q13, q9, q1 @ \n\t" \
- "subs r4, r4, #1 @ \n\t" \
- "beq 2f @ \n\t" \
- "vld1.32 {d16-d19}, [r1]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vst1.32 {d24-d27}, [r2]! @ \n\t" \
- "b 1b @ \n\t" \
- "2: @ \n\t" \
- "vst1.32 {d24-d27}, [r2]! @ \n\t" \
- "3: @ \n\t" \
- "ands r4, %3, #7 @ \n\t" \
- "beq 5f @ \n\t" \
- "4: @ \n\t" \
- "subs r4, r4, #1 @ \n\t" \
- "vld1.32 {d16[0]}, [r1]! @ \n\t" \
- "vld1.32 {d0[0]}, [r0]! @ \n\t" \
- "vsub.f32 d24, d16, d0 @ \n\t" \
- "vst1.32 {d24[0]}, [r2]! @ \n\t" \
- "bne 4b @ \n\t" \
- "5: @ " \
- : \
- :"r" (z), "r" (x),"r" (y), "r"(n) \
- : "cc", "r0", "r1", "r2", "r4", "memory", \
- "q0", "q1", "q8", "q9", "q12", "q13", \
- "d0", "d1", "d2", "d3", \
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27" \
- ); \
- }
-
-#define THFloatVector_scale(y, c, n) { \
- float ctemp = c; \
- float * caddr = &ctemp; \
- __asm__ __volatile__ ( \
- "mov r0, %0 @ \n\t" \
- "mov r2, r0 @ \n\t" \
- "ldr r5, [%1] @ \n\t" \
- "vdup.32 q14, r5 @ \n\t" \
- "lsrs r5, %2, #5 @ \n\t" \
- "beq 3f @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vld1.32 {d4-d7}, [r0]! @ \n\t" \
- "vld1.32 {d8-d11}, [r0]! @ \n\t" \
- "vld1.32 {d12-d15}, [r0]! @ \n\t" \
- "1: @ \n\t" \
- "vmul.f32 q0, q0, q14 @ \n\t" \
- "vmul.f32 q1, q1, q14 @ \n\t" \
- "vmul.f32 q2, q2, q14 @ \n\t" \
- "vmul.f32 q3, q3, q14 @ \n\t" \
- "vmul.f32 q4, q4, q14 @ \n\t" \
- "vmul.f32 q5, q5, q14 @ \n\t" \
- "vmul.f32 q6, q6, q14 @ \n\t" \
- "vmul.f32 q7, q7, q14 @ \n\t" \
- "subs r5, r5, #1 @ \n\t" \
- "beq 2f @ \n\t" \
- "vst1.32 {d0-d3}, [r2]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vst1.32 {d4-d7}, [r2]! @ \n\t" \
- "vld1.32 {d4-d7}, [r0]! @ \n\t" \
- "vst1.32 {d8-d11}, [r2]! @ \n\t" \
- "vld1.32 {d8-d11}, [r0]! @ \n\t" \
- "vst1.32 {d12-d15}, [r2]! @ \n\t" \
- "vld1.32 {d12-d15}, [r0]! @ \n\t" \
- "b 1b @ \n\t" \
- "2: @ \n\t" \
- "vst1.32 {d0-d3}, [r2]! @ \n\t" \
- "vst1.32 {d4-d7}, [r2]! @ \n\t" \
- "vst1.32 {d8-d11}, [r2]! @ \n\t" \
- "vst1.32 {d12-d15}, [r2]! @ \n\t" \
- "3: @ \n\t" \
- "lsrs r5, %2, #4 @ \n\t" \
- "ands r5, r5, #1 @ \n\t" \
- "beq 4f @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vld1.32 {d4-d7}, [r0]! @ \n\t" \
- "vmul.f32 q0, q0, q14 @ \n\t" \
- "vmul.f32 q1, q1, q14 @ \n\t" \
- "vmul.f32 q2, q2, q14 @ \n\t" \
- "vmul.f32 q3, q3, q14 @ \n\t" \
- "vst1.32 {d0-d3}, [r2]! @ \n\t" \
- "vst1.32 {d4-d7}, [r2]! @ \n\t" \
- "4: @ \n\t" \
- "lsrs r5, %2, #3 @ \n\t" \
- "ands r5, r5, #1 @ \n\t" \
- "beq 5f @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vmul.f32 q0, q0, q14 @ \n\t" \
- "vmul.f32 q1, q1, q14 @ \n\t" \
- "vst1.32 {d0-d3}, [r2]! @ \n\t" \
- "5: @ \n\t" \
- "ands r5, %2, #7 @ \n\t" \
- "beq 7f @ \n\t" \
- "6: @ \n\t" \
- "subs r5, r5, #1 @ \n\t" \
- "vld1.32 d0[0], [r0]! @ \n\t" \
- "vmul.f32 d0, d0, d28 @ \n\t" \
- "vst1.32 d0[0], [r2]! @ \n\t" \
- "bne 6b @ \n\t" \
- "7: @ " \
- : \
- :"r" (y), "r"(caddr),"r"(n) \
- : "cc", "r0", "r2", "r5", "memory", \
- "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14", \
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", \
- "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", \
- "d28", "d29" \
- ); \
- }
-
-#define THFloatVector_mul(y, x, n) { \
- __asm__ __volatile__ ( \
- "mov r0, %0 @ \n\t" \
- "mov r1, %1 @ \n\t" \
- "mov r2, r0 @ \n\t" \
- "lsrs r4, %2, #3 @ \n\t" \
- "beq 3f @ \n\t" \
- "vld1.32 {d16-d19}, [r1]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "1: @ \n\t" \
- "vmul.f32 q12, q8, q0 @ \n\t" \
- "vmul.f32 q13, q9, q1 @ \n\t" \
- "subs r4, r4, #1 @ \n\t" \
- "beq 2f @ \n\t" \
- "vld1.32 {d16-d19}, [r1]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vst1.32 {d24-d27}, [r2]! @ \n\t" \
- "b 1b @ \n\t" \
- "2: @ \n\t" \
- "vst1.32 {d24-d27}, [r2]! @ \n\t" \
- "3: @ \n\t" \
- "ands r4, %2, #7 @ \n\t" \
- "beq 5f @ \n\t" \
- "4: @ \n\t" \
- "subs r4, r4, #1 @ \n\t" \
- "vld1.32 {d16[0]}, [r1]! @ \n\t" \
- "vld1.32 {d0[0]}, [r0]! @ \n\t" \
- "vmul.f32 q12, q8, q0 @ \n\t" \
- "vst1.32 {d24[0]}, [r2]! @ \n\t" \
- "bne 4b @ \n\t" \
- "5: @ " \
- : \
- :"r" (y),"r" (x),"r"(n) \
- : "cc", "r0", "r1", "r2", "r4", "memory", \
- "q0", "q1", "q8", "q9", "q12", "q13", \
- "d0", "d1", "d2", "d3", \
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27" \
- ); \
- }
-#define THFloatVector_add(y, x, c, n) { \
- float ctemp = c; \
- float * caddr = &ctemp; \
- __asm__ __volatile__ ( \
- "mov r0, %0 @ \n\t" \
- "mov r1, %1 @ \n\t" \
- "mov r2, r0 @ \n\t" \
- "ldr r5, [%2] @ \n\t" \
- "vdup.32 q14, r5 @ \n\t" \
- "lsrs r5, %3, #4 @ \n\t" \
- "beq 3f @ \n\t" \
- "vld1.32 {d16-d19}, [r1]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vld1.32 {d20-d23}, [r1]! @ \n\t" \
- "vld1.32 {d4-d7}, [r0]! @ \n\t" \
- "1: @ \n\t" \
- "vmla.f32 q0, q8, q14 @ \n\t" \
- "vmla.f32 q1, q9, q14 @ \n\t" \
- "vmla.f32 q2, q10, q14 @ \n\t" \
- "vmla.f32 q3, q11, q14 @ \n\t" \
- "subs r5, r5, #1 @ \n\t" \
- "beq 2f @ \n\t" \
- "vld1.32 {d16-d19}, [r1]! @ \n\t" \
- "vld1.32 {d20-d23}, [r1]! @ \n\t" \
- "vst1.32 {d0-d3}, [r2]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vst1.32 {d4-d7}, [r2]! @ \n\t" \
- "vld1.32 {d4-d7}, [r0]! @ \n\t" \
- "b 1b @ \n\t" \
- "2: @ \n\t" \
- "vst1.32 {d0-d3}, [r2]! @ \n\t" \
- "vst1.32 {d4-d7}, [r2]! @ \n\t" \
- "3: @ \n\t" \
- "lsrs r5, %3, #3 @ \n\t" \
- "ands r5, #1 @ \n\t" \
- "beq 4f @ \n\t" \
- "vld1.32 {d16-d19}, [r1]! @ \n\t" \
- "vld1.32 {d0-d3}, [r0]! @ \n\t" \
- "vmla.f32 q0, q8, q14 @ \n\t" \
- "vmla.f32 q1, q9, q14 @ \n\t" \
- "vst1.32 {d0-d3}, [r2]! @ \n\t" \
- "4: @ \n\t" \
- "ands r5, %3, #7 @ \n\t" \
- "beq 6f @ \n\t" \
- "5: @ \n\t" \
- "subs r5, r5, #1 @ \n\t" \
- "vld1.32 {d16[0]}, [r1]! @ \n\t" \
- "vld1.32 {d0[0]}, [r0]! @ \n\t" \
- "vmla.f32 d0, d16, d28 @ \n\t" \
- "vst1.32 d0[0], [r2]! @ \n\t" \
- "bne 5b @ \n\t" \
- "6: @ " \
- : \
- :"r" (y),"r" (x), "r"(caddr),"r"(n) \
- : "cc", "r0", "r1", "r2", "r5", "memory", \
- "q0", "q1", "q2", "q3", "q14", \
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", \
- "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29" \
- ); \
- }
-
-static inline void THDoubleVector_fill(double *x, const double c, const long n) {
- long i = 0;
-
- for(; i < n-4; i += 4)
- {
- x[i] = c;
- x[i+1] = c;
- x[i+2] = c;
- x[i+3] = c;
- }
-
- for(; i < n; i++)
- x[i] = c;
-}
-
-static inline void THDoubleVector_add(double *y, const double *x, const double c, const long n)
-{
- long i = 0;
-
- for(;i < n-4; i += 4)
- {
- y[i] += c * x[i];
- y[i+1] += c * x[i+1];
- y[i+2] += c * x[i+2];
- y[i+3] += c * x[i+3];
- }
-
- for(; i < n; i++)
- y[i] += c * x[i];
-}
-
-static inline void THDoubleVector_diff(double *z, const double *x, const double *y, const long n)
-{
- long i = 0;
-
- for(; i < n-4; i += 4)
- {
- z[i] = x[i] - y[i];
- z[i+1] = x[i+1] - y[i+1];
- z[i+2] = x[i+2] - y[i+2];
- z[i+3] = x[i+3] - y[i+3];
- }
-
- for(; i < n; i++)
- z[i] = x[i] - y[i];
-}
-
-static inline void THDoubleVector_scale(double *y, const double c, const long n)
-{
- long i = 0;
-
- for(; i < n-4; i +=4)
- {
- y[i] *= c;
- y[i+1] *= c;
- y[i+2] *= c;
- y[i+3] *= c;
- }
-
- for(; i < n; i++)
- y[i] *= c;
-}
-
-static inline void THDoubleVector_mul(double *y, const double *x, const long n)
-{
- long i = 0;
-
- for(; i < n-4; i += 4)
- {
- y[i] *= x[i];
- y[i+1] *= x[i+1];
- y[i+2] *= x[i+2];
- y[i+3] *= x[i+3];
- }
-
- for(; i < n; i++)
- y[i] *= x[i];
-}
-
-
-#else
-
-/* If SSE2 not defined, then generate plain C operators */
-#include "generic/THVector.c"
-#include "THGenerateFloatTypes.h"
-
-#endif
-
-/* For non-float types, generate plain C operators */
-#include "generic/THVector.c"
-#include "THGenerateIntTypes.h"
-
-#endif
+#endif // TH_VECTOR_INC
diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index cac043e..788f6c7 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -41,9 +41,9 @@ THStorage* THStorage_(newWithAllocator)(long size,
return storage;
}
-THStorage* THStorage_(newWithMapping)(const char *filename, long size, int shared)
+THStorage* THStorage_(newWithMapping)(const char *filename, long size, int flags)
{
- THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, shared);
+ THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, flags);
THStorage *storage = THStorage_(newWithAllocator)(size,
&THMapAllocator,
@@ -203,4 +203,24 @@ real THStorage_(get)(const THStorage *self, long idx)
return self->data[idx];
}
+void THStorage_(swap)(THStorage *storage1, THStorage *storage2)
+{
+#define SWAP(val) { val = storage1->val; storage1->val = storage2->val; storage2->val = val; }
+ real *data;
+ long size;
+ char flag;
+ THAllocator *allocator;
+ void *allocatorContext;
+ struct THStorage *view;
+
+ SWAP(data);
+ SWAP(size);
+ SWAP(flag);
+ // don't swap refcount!
+ SWAP(allocator);
+ SWAP(allocatorContext);
+ SWAP(view);
+#undef SWAP
+}
+
#endif
diff --git a/lib/TH/generic/THStorage.h b/lib/TH/generic/THStorage.h
index 79013d8..0f6dcca 100644
--- a/lib/TH/generic/THStorage.h
+++ b/lib/TH/generic/THStorage.h
@@ -46,7 +46,7 @@ TH_API THStorage* THStorage_(newWithSize1)(real);
TH_API THStorage* THStorage_(newWithSize2)(real, real);
TH_API THStorage* THStorage_(newWithSize3)(real, real, real);
TH_API THStorage* THStorage_(newWithSize4)(real, real, real, real);
-TH_API THStorage* THStorage_(newWithMapping)(const char *filename, long size, int shared);
+TH_API THStorage* THStorage_(newWithMapping)(const char *filename, long size, int flags);
/* takes ownership of data */
TH_API THStorage* THStorage_(newWithData)(real *data, long size);
@@ -61,6 +61,7 @@ TH_API THStorage* THStorage_(newWithDataAndAllocator)(
TH_API void THStorage_(setFlag)(THStorage *storage, const char flag);
TH_API void THStorage_(clearFlag)(THStorage *storage, const char flag);
TH_API void THStorage_(retain)(THStorage *storage);
+TH_API void THStorage_(swap)(THStorage *storage1, THStorage *storage2);
/* might differ with other API (like CUDA) */
TH_API void THStorage_(free)(THStorage *storage);
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index c3da469..cae5959 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -823,8 +823,6 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor
THTensor_(copy)(r_, t);
}
-/* printf("%ldx%ld = %ldx%ld X %ldx%ld\n", r_->size[0], r_->size[1], m1->size[0], m1->size[1], m2->size[0], m2->size[1]); */
-
/* r_ */
if(r_->stride[0] == 1 &&
r_->stride[1] != 0)
@@ -1937,7 +1935,7 @@ void THTensor_(tril)(THTensor *r_, THTensor *t, long k)
for(r = 0; r < t_size_0; r++)
{
long sz = THMin(r+k+1, t_size_1);
- for(c = THMax(0, r+k); c < t_size_1; c++)
+ for(c = THMax(0, r+k+1); c < t_size_1; c++)
r__data[r*r__stride_0+c*r__stride_1] = 0;
for(c = 0; c < sz; c++)
r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
@@ -2066,30 +2064,26 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value) \
{ \
THByteTensor_rawResize(r_, t->nDimension, t->size, NULL); \
- THByteTensor_zero(r_); \
TH_TENSOR_APPLY2(unsigned char, r_, real, t, \
- if (*t_data OP value) *r__data = 1;); \
+ *r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \
{ \
THTensor_(rawResize)(r_, t->nDimension, t->size, NULL); \
- THTensor_(zero)(r_); \
TH_TENSOR_APPLY2(real, r_, real, t, \
- if (*t_data OP value) *r__data = 1;); \
+ *r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
{ \
THByteTensor_rawResize(r_, ta->nDimension, ta->size, NULL); \
- THByteTensor_zero(r_); \
TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \
- if(*ta_data OP *tb_data) *r__data = 1;); \
+ *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
{ \
THTensor_(rawResize)(r_, ta->nDimension, ta->size, NULL); \
- THTensor_(zero)(r_); \
TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \
- if(*ta_data OP *tb_data) *r__data = 1;); \
+ *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
diff --git a/lib/TH/generic/THTensorRandom.c b/lib/TH/generic/THTensorRandom.c
index f8097c8..514d3dd 100644
--- a/lib/TH/generic/THTensorRandom.c
+++ b/lib/TH/generic/THTensorRandom.c
@@ -119,7 +119,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso
THArgCheckWithCleanup((sum > 0), THCleanup(THDoubleTensor_free(cum_dist);), 2,
"invalid multinomial distribution (sum of probabilities <= 0)");
/* normalize cumulative probability distribution so that last val is 1
- i.e. dosen't assume original prob_dist row sums to one */
+ i.e. doesn't assume original prob_dist row sums to one */
if ( (sum > 0) || ( ( sum < 1.00001) && (sum > 0.99999) ) )
{
for (j=0; j<n_categories; j++)
diff --git a/lib/TH/generic/THVector.h b/lib/TH/generic/THVector.h
new file mode 100644
index 0000000..09067e5
--- /dev/null
+++ b/lib/TH/generic/THVector.h
@@ -0,0 +1,14 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THVector.h"
+#else
+
+TH_API void THVector_(fill)(real *x, const real c, const long n);
+TH_API void THVector_(add)(real *y, const real *x, const real c, const long n);
+TH_API void THVector_(diff)(real *z, const real *x, const real *y, const long n);
+TH_API void THVector_(scale)(real *y, const real c, const long n);
+TH_API void THVector_(mul)(real *y, const real *x, const long n);
+
+/* Initialize the dispatch pointers */
+TH_API void THVector_(vectorDispatchInit)();
+
+#endif
diff --git a/lib/TH/generic/THVector.c b/lib/TH/generic/THVectorDefault.c
similarity index 67%
rename from lib/TH/generic/THVector.c
rename to lib/TH/generic/THVectorDefault.c
index 6c8a96b..d51be03 100644
--- a/lib/TH/generic/THVector.c
+++ b/lib/TH/generic/THVectorDefault.c
@@ -1,8 +1,8 @@
#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/THVector.c"
+#define TH_GENERIC_FILE "generic/THVectorDefault.c"
#else
-static TH_INLINE void THVector_(fill)(real *x, const real c, const long n) {
+void THVector_(fill_DEFAULT)(real *x, const real c, const long n) {
long i = 0;
for(; i < n-4; i += 4)
@@ -17,7 +17,7 @@ static TH_INLINE void THVector_(fill)(real *x, const real c, const long n) {
x[i] = c;
}
-static TH_INLINE void THVector_(add)(real *y, const real *x, const real c, const long n)
+void THVector_(add_DEFAULT)(real *y, const real *x, const real c, const long n)
{
long i = 0;
@@ -33,7 +33,7 @@ static TH_INLINE void THVector_(add)(real *y, const real *x, const real c, const
y[i] += c * x[i];
}
-static TH_INLINE void THVector_(diff)(real *z, const real *x, const real *y, const long n)
+void THVector_(diff_DEFAULT)(real *z, const real *x, const real *y, const long n)
{
long i = 0;
@@ -49,7 +49,7 @@ static TH_INLINE void THVector_(diff)(real *z, const real *x, const real *y, con
z[i] = x[i] - y[i];
}
-static TH_INLINE void THVector_(scale)(real *y, const real c, const long n)
+void THVector_(scale_DEFAULT)(real *y, const real c, const long n)
{
long i = 0;
@@ -65,7 +65,7 @@ static TH_INLINE void THVector_(scale)(real *y, const real c, const long n)
y[i] *= c;
}
-static TH_INLINE void THVector_(mul)(real *y, const real *x, const long n)
+void THVector_(mul_DEFAULT)(real *y, const real *x, const long n)
{
long i = 0;
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
new file mode 100644
index 0000000..f16bcda
--- /dev/null
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -0,0 +1,140 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THVectorDispatch.c"
+#else
+
+/* For now there are only SIMD implementations for FLOAT and DOUBLE.
+ * Hopefully in the future this can be made totally generic (e.g, there are SIMD implementations
+ * for a lot of functions */
+/* Each function with multiple implementations has:
+ * 1. A DISPATCHPTR which will be initialized to point to the best available implementation for the host
+ * 2. A DISPATCHTABLE which holds pointers to each implementation of a function, and a value indicating
+ * which SIMD extension a given implementation uses
+ * 3. A dispatch stub, which is what is actually called by clients, that simply wraps the dispatch pointer.
+ */
+
+static void (*THVector_(fill_DISPATCHPTR))(real *, const real, const long) = &THVector_(fill_DEFAULT);
+static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+  #if defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(fill_NEON), SIMDExtension_NEON),
+  #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+  || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+  #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(fill_SSE), SIMDExtension_SSE),
+  #endif
+  #endif
+  FUNCTION_IMPL(THVector_(fill_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(fill)(real *x, const real c, const long n) {
+  THVector_(fill_DISPATCHPTR)(x, c, n);
+}
+
+
+static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const long) = &THVector_(add_DEFAULT);
+static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+  #if defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(add_NEON), SIMDExtension_NEON),
+  #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+  || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+  #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(add_SSE), SIMDExtension_SSE),
+  #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(add_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(add)(real *y, const real *x, const real c, const long n) {
+  THVector_(add_DISPATCHPTR)(y, x, c, n);
+}
+
+
+static void (*THVector_(diff_DISPATCHPTR))(real *, const real *, const real *, const long) = &THVector_(diff_DEFAULT);
+static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+  #if defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(diff_NEON), SIMDExtension_NEON),
+  #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+  || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+  #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(diff_SSE), SIMDExtension_SSE),
+  #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(diff_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(diff)(real *z, const real *x, const real *y, const long n) {
+  THVector_(diff_DISPATCHPTR)(z, x, y, n);
+}
+
+
+static void (*THVector_(scale_DISPATCHPTR))(real *, const real, const long) = &THVector_(scale_DEFAULT);
+static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+  #if defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(scale_NEON), SIMDExtension_NEON),
+  #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+  || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+  #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(scale_SSE), SIMDExtension_SSE),
+  #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(scale_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(scale)(real *y, const real c, const long n) {
+  THVector_(scale_DISPATCHPTR)(y, c, n);
+}
+
+
+static void (*THVector_(mul_DISPATCHPTR))(real *, const real *, const long) = &THVector_(mul_DEFAULT);
+static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+  #if defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(mul_NEON), SIMDExtension_NEON),
+  #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+  || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+  #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+  FUNCTION_IMPL(THVector_(mul_SSE), SIMDExtension_SSE),
+  #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(mul_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(mul)(real *y, const real *x, const long n) {
+  THVector_(mul_DISPATCHPTR)(y, x, n);
+}
+
+/* This needs to be called in order to initialize the dispatch pointers at runtime.
+ * This function simply checks what SIMD extensions are available, and then walks the dispatch table
+ * to choose the best function.
+ * NOTE: As implemented, it will initialize the dispatch pointer to the first supported function.
+ * This means that in the dispatch tables, implementations supporting more recent extensions
+ * need to come first
+ */
+void THVector_(vectorDispatchInit)()
+{
+ uint32_t hostSimdExts = detectHostSIMDExtensions();
+ INIT_DISPATCH_PTR(fill);
+ INIT_DISPATCH_PTR(add);
+ INIT_DISPATCH_PTR(diff);
+ INIT_DISPATCH_PTR(scale);
+ INIT_DISPATCH_PTR(mul);
+}
+
+#endif
diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h
new file mode 100644
index 0000000..e4660b1
--- /dev/null
+++ b/lib/TH/generic/simd/simd.h
@@ -0,0 +1,91 @@
+#ifndef TH_SIMD_INC
+#define TH_SIMD_INC
+
+#include <stdint.h>
+
+// Can be found on Intel ISA Reference for CPUID
+#define CPUID_AVX2_BIT 0x20 // Bit 5 of EBX for EAX=0x7
+#define CPUID_AVX_BIT 0x10000000 // Bit 28 of ECX for EAX=0x1
+#define CPUID_SSE_BIT 0x2000000 // bit 25 of EDX for EAX=0x1
+
+// Helper macros for initialization
+#define FUNCTION_IMPL(NAME, EXT) \
+ { .function=(void *)NAME, \
+ .supportedSimdExt=EXT \
+ }
+
+#define INIT_DISPATCH_PTR(OP) \
+ do { \
+ int i; \
+ for (i = 0; i < sizeof(THVector_(OP ## _DISPATCHTABLE)) / sizeof(FunctionDescription); ++i) { \
+ THVector_(OP ## _DISPATCHPTR) = THVector_(OP ## _DISPATCHTABLE)[i].function; \
+ if (THVector_(OP ## _DISPATCHTABLE)[i].supportedSimdExt & hostSimdExts) { \
+ break; \
+ } \
+ } \
+ } while(0)
+
+
+typedef struct FunctionDescription
+{
+ void *function;
+ uint32_t supportedSimdExt;
+} FunctionDescription;
+
+
+enum SIMDExtensions
+{
+#if defined(__NEON__)
+ SIMDExtension_NEON = 0x1,
+#else
+ SIMDExtension_AVX2 = 0x1,
+ SIMDExtension_AVX = 0x2,
+ SIMDExtension_SSE = 0x4,
+#endif
+ SIMDExtension_DEFAULT = 0x0
+};
+
+#if defined(__NEON__)
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+ return SIMDExtension_NEON;
+}
+
+#else // x86
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ uint32_t a = *eax, b, c, d;
+ asm volatile ( "cpuid\n\t"
+ : "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
+ *eax = a;
+ *ebx = b;
+ *ecx = c;
+ *edx = d;
+}
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+ uint32_t eax, ebx, ecx, edx;
+ uint32_t hostSimdExts = 0x0;
+
+ // Check for AVX2. Requires separate CPUID
+ eax = 0x7;
+ cpuid(&eax, &ebx, &ecx, &edx);
+ if (ebx & CPUID_AVX2_BIT)
+ hostSimdExts |= SIMDExtension_AVX2;
+
+ eax = 0x1;
+ cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & CPUID_AVX_BIT)
+ hostSimdExts |= SIMDExtension_AVX;
+ if (edx & CPUID_SSE_BIT)
+ hostSimdExts |= SIMDExtension_SSE;
+
+ return hostSimdExts;
+}
+
+#endif // end x86 SIMD extension detection code
+
+#endif
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c
new file mode 100644
index 0000000..9d65550
--- /dev/null
+++ b/lib/TH/vector/NEON.c
@@ -0,0 +1,252 @@
+static void THFloatVector_fill_NEON(float *x, const float c, const long n) {
+ float ctemp = c;
+ float * caddr = &ctemp;
+ __asm__ __volatile__ (
+ "mov r0, %0 @ \n\t"
+ "ldr r4, [%1] @ \n\t"
+ "vdup.32 q12, r4 @ \n\t"
+ "vdup.32 q13, r4 @ \n\t"
+ "lsrs r4, %2, #3 @ \n\t"
+ "beq 3f @ \n\t"
+ "1: @ \n\t"
+ "vst1.32 {d24-d27}, [r0]! @ \n\t"
+ "subs r4, r4, #1 @ \n\t"
+ "bne 1b @ \n\t"
+ "3: @ \n\t"
+ "ands r4, %2, #7 @ \n\t"
+ "beq 5f @ \n\t"
+ "4: @ \n\t"
+ "subs r4, r4, #1 @ \n\t"
+ "vst1.32 {d24[0]}, [r0]! @ \n\t"
+ "bne 4b @ \n\t"
+ "5: @ "
+ :
+ :"r" (x), "r"(caddr),"r"(n)
+ : "cc", "r0", "r4", "memory",
+ "q12",
+ "d24", "d25", "d26", "d27"
+ );
+}
+
+
+static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const long n) {
+ __asm__ __volatile__ (
+ "mov r0, %2 @ \n\t"
+ "mov r1, %1 @ \n\t"
+ "mov r2, %0 @ \n\t"
+ "lsrs r4, %3, #3 @ \n\t"
+ "beq 3f @ \n\t"
+ "vld1.32 {d16-d19}, [r1]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "1: @ \n\t"
+ "vsub.f32 q12, q8, q0 @ \n\t"
+ "vsub.f32 q13, q9, q1 @ \n\t"
+ "subs r4, r4, #1 @ \n\t"
+ "beq 2f @ \n\t"
+ "vld1.32 {d16-d19}, [r1]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vst1.32 {d24-d27}, [r2]! @ \n\t"
+ "b 1b @ \n\t"
+ "2: @ \n\t"
+ "vst1.32 {d24-d27}, [r2]! @ \n\t"
+ "3: @ \n\t"
+ "ands r4, %3, #7 @ \n\t"
+ "beq 5f @ \n\t"
+ "4: @ \n\t"
+ "subs r4, r4, #1 @ \n\t"
+ "vld1.32 {d16[0]}, [r1]! @ \n\t"
+ "vld1.32 {d0[0]}, [r0]! @ \n\t"
+ "vsub.f32 d24, d16, d0 @ \n\t"
+ "vst1.32 {d24[0]}, [r2]! @ \n\t"
+ "bne 4b @ \n\t"
+ "5: @ "
+ :
+ :"r" (z), "r" (x),"r" (y), "r"(n)
+ : "cc", "r0", "r1", "r2", "r4", "memory",
+ "q0", "q1", "q8", "q9", "q12", "q13",
+ "d0", "d1", "d2", "d3",
+ "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
+ );
+}
+
+
+static void THFloatVector_scale_NEON(float *y, const float c, const long n) {
+ float ctemp = c;
+ float * caddr = &ctemp;
+ __asm__ __volatile__ (
+ "mov r0, %0 @ \n\t"
+ "mov r2, r0 @ \n\t"
+ "ldr r5, [%1] @ \n\t"
+ "vdup.32 q14, r5 @ \n\t"
+ "lsrs r5, %2, #5 @ \n\t"
+ "beq 3f @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vld1.32 {d4-d7}, [r0]! @ \n\t"
+ "vld1.32 {d8-d11}, [r0]! @ \n\t"
+ "vld1.32 {d12-d15}, [r0]! @ \n\t"
+ "1: @ \n\t"
+ "vmul.f32 q0, q0, q14 @ \n\t"
+ "vmul.f32 q1, q1, q14 @ \n\t"
+ "vmul.f32 q2, q2, q14 @ \n\t"
+ "vmul.f32 q3, q3, q14 @ \n\t"
+ "vmul.f32 q4, q4, q14 @ \n\t"
+ "vmul.f32 q5, q5, q14 @ \n\t"
+ "vmul.f32 q6, q6, q14 @ \n\t"
+ "vmul.f32 q7, q7, q14 @ \n\t"
+ "subs r5, r5, #1 @ \n\t"
+ "beq 2f @ \n\t"
+ "vst1.32 {d0-d3}, [r2]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vst1.32 {d4-d7}, [r2]! @ \n\t"
+ "vld1.32 {d4-d7}, [r0]! @ \n\t"
+ "vst1.32 {d8-d11}, [r2]! @ \n\t"
+ "vld1.32 {d8-d11}, [r0]! @ \n\t"
+ "vst1.32 {d12-d15}, [r2]! @ \n\t"
+ "vld1.32 {d12-d15}, [r0]! @ \n\t"
+ "b 1b @ \n\t"
+ "2: @ \n\t"
+ "vst1.32 {d0-d3}, [r2]! @ \n\t"
+ "vst1.32 {d4-d7}, [r2]! @ \n\t"
+ "vst1.32 {d8-d11}, [r2]! @ \n\t"
+ "vst1.32 {d12-d15}, [r2]! @ \n\t"
+ "3: @ \n\t"
+ "lsrs r5, %2, #4 @ \n\t"
+ "ands r5, r5, #1 @ \n\t"
+ "beq 4f @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vld1.32 {d4-d7}, [r0]! @ \n\t"
+ "vmul.f32 q0, q0, q14 @ \n\t"
+ "vmul.f32 q1, q1, q14 @ \n\t"
+ "vmul.f32 q2, q2, q14 @ \n\t"
+ "vmul.f32 q3, q3, q14 @ \n\t"
+ "vst1.32 {d0-d3}, [r2]! @ \n\t"
+ "vst1.32 {d4-d7}, [r2]! @ \n\t"
+ "4: @ \n\t"
+ "lsrs r5, %2, #3 @ \n\t"
+ "ands r5, r5, #1 @ \n\t"
+ "beq 5f @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vmul.f32 q0, q0, q14 @ \n\t"
+ "vmul.f32 q1, q1, q14 @ \n\t"
+ "vst1.32 {d0-d3}, [r2]! @ \n\t"
+ "5: @ \n\t"
+ "ands r5, %2, #7 @ \n\t"
+ "beq 7f @ \n\t"
+ "6: @ \n\t"
+ "subs r5, r5, #1 @ \n\t"
+ "vld1.32 d0[0], [r0]! @ \n\t"
+ "vmul.f32 d0, d0, d28 @ \n\t"
+ "vst1.32 d0[0], [r2]! @ \n\t"
+ "bne 6b @ \n\t"
+ "7: @ "
+ :
+ :"r" (y), "r"(caddr),"r"(n)
+ : "cc", "r0", "r2", "r5", "memory",
+ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d28", "d29"
+ );
+
+}
+
+static void THFloatVector_mul_NEON(float *y, const float *x, const long n) {
+ __asm__ __volatile__ (
+ "mov r0, %0 @ \n\t"
+ "mov r1, %1 @ \n\t"
+ "mov r2, r0 @ \n\t"
+ "lsrs r4, %2, #3 @ \n\t"
+ "beq 3f @ \n\t"
+ "vld1.32 {d16-d19}, [r1]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "1: @ \n\t"
+ "vmul.f32 q12, q8, q0 @ \n\t"
+ "vmul.f32 q13, q9, q1 @ \n\t"
+ "subs r4, r4, #1 @ \n\t"
+ "beq 2f @ \n\t"
+ "vld1.32 {d16-d19}, [r1]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vst1.32 {d24-d27}, [r2]! @ \n\t"
+ "b 1b @ \n\t"
+ "2: @ \n\t"
+ "vst1.32 {d24-d27}, [r2]! @ \n\t"
+ "3: @ \n\t"
+ "ands r4, %2, #7 @ \n\t"
+ "beq 5f @ \n\t"
+ "4: @ \n\t"
+ "subs r4, r4, #1 @ \n\t"
+ "vld1.32 {d16[0]}, [r1]! @ \n\t"
+ "vld1.32 {d0[0]}, [r0]! @ \n\t"
+ "vmul.f32 q12, q8, q0 @ \n\t"
+ "vst1.32 {d24[0]}, [r2]! @ \n\t"
+ "bne 4b @ \n\t"
+ "5: @ "
+ :
+ :"r" (y),"r" (x),"r"(n)
+ : "cc", "r0", "r1", "r2", "r4", "memory",
+ "q0", "q1", "q8", "q9", "q12", "q13",
+ "d0", "d1", "d2", "d3",
+ "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
+ );
+}
+
+static void THFloatVector_add_NEON(float *y, const float *x, const float c, const long n) {
+ float ctemp = c;
+ float * caddr = &ctemp;
+ __asm__ __volatile__ (
+ "mov r0, %0 @ \n\t"
+ "mov r1, %1 @ \n\t"
+ "mov r2, r0 @ \n\t"
+ "ldr r5, [%2] @ \n\t"
+ "vdup.32 q14, r5 @ \n\t"
+ "lsrs r5, %3, #4 @ \n\t"
+ "beq 3f @ \n\t"
+ "vld1.32 {d16-d19}, [r1]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vld1.32 {d20-d23}, [r1]! @ \n\t"
+ "vld1.32 {d4-d7}, [r0]! @ \n\t"
+ "1: @ \n\t"
+ "vmla.f32 q0, q8, q14 @ \n\t"
+ "vmla.f32 q1, q9, q14 @ \n\t"
+ "vmla.f32 q2, q10, q14 @ \n\t"
+ "vmla.f32 q3, q11, q14 @ \n\t"
+ "subs r5, r5, #1 @ \n\t"
+ "beq 2f @ \n\t"
+ "vld1.32 {d16-d19}, [r1]! @ \n\t"
+ "vld1.32 {d20-d23}, [r1]! @ \n\t"
+ "vst1.32 {d0-d3}, [r2]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vst1.32 {d4-d7}, [r2]! @ \n\t"
+ "vld1.32 {d4-d7}, [r0]! @ \n\t"
+ "b 1b @ \n\t"
+ "2: @ \n\t"
+ "vst1.32 {d0-d3}, [r2]! @ \n\t"
+ "vst1.32 {d4-d7}, [r2]! @ \n\t"
+ "3: @ \n\t"
+ "lsrs r5, %3, #3 @ \n\t"
+ "ands r5, #1 @ \n\t"
+ "beq 4f @ \n\t"
+ "vld1.32 {d16-d19}, [r1]! @ \n\t"
+ "vld1.32 {d0-d3}, [r0]! @ \n\t"
+ "vmla.f32 q0, q8, q14 @ \n\t"
+ "vmla.f32 q1, q9, q14 @ \n\t"
+ "vst1.32 {d0-d3}, [r2]! @ \n\t"
+ "4: @ \n\t"
+ "ands r5, %3, #7 @ \n\t"
+ "beq 6f @ \n\t"
+ "5: @ \n\t"
+ "subs r5, r5, #1 @ \n\t"
+ "vld1.32 {d16[0]}, [r1]! @ \n\t"
+ "vld1.32 {d0[0]}, [r0]! @ \n\t"
+ "vmla.f32 d0, d16, d28 @ \n\t"
+ "vst1.32 d0[0], [r2]! @ \n\t"
+ "bne 5b @ \n\t"
+ "6: @ "
+ :
+ :"r" (y),"r" (x), "r"(caddr),"r"(n)
+ : "cc", "r0", "r1", "r2", "r5", "memory",
+ "q0", "q1", "q2", "q3", "q14",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29"
+ );
+}
diff --git a/lib/TH/vector/SSE.c b/lib/TH/vector/SSE.c
new file mode 100644
index 0000000..f909907
--- /dev/null
+++ b/lib/TH/vector/SSE.c
@@ -0,0 +1,213 @@
+#include <x86intrin.h>
+
+
+static void THDoubleVector_fill_SSE(double *x, const double c, const long n) {
+ long i;
+ long off;
+ __m128d XMM0 = _mm_set1_pd(c);
+ for (i=0; i<=((n)-8); i+=8) {
+ _mm_storeu_pd((x)+i , XMM0);
+ _mm_storeu_pd((x)+i+2, XMM0);
+ _mm_storeu_pd((x)+i+4, XMM0);
+ _mm_storeu_pd((x)+i+6, XMM0);
+ }
+ off = (n) - ((n)%8);
+ for (i=0; i<((n)%8); i++) {
+ x[off+i] = c;
+ }
+}
+
+
+static void THDoubleVector_add_SSE(double *y, const double *x, const double c, const long n) {
+ long i = 0;
+ __m128d XMM7 = _mm_set1_pd(c);
+ __m128d XMM0,XMM2;
+ for (; i<=((n)-2); i+=2) {
+ XMM0 = _mm_loadu_pd((x)+i);
+ XMM2 = _mm_loadu_pd((y)+i);
+ XMM0 = _mm_mul_pd(XMM0, XMM7);
+ XMM2 = _mm_add_pd(XMM2, XMM0);
+ _mm_storeu_pd((y)+i , XMM2);
+ }
+ for (; i<(n); i++) {
+ y[i] += c * x[i];
+ }
+}
+
+
+static void THDoubleVector_diff_SSE(double *z, const double *x, const double *y, const long n) {
+ long i;
+ for (i=0; i<=((n)-8); i+=8) {
+ __m128d XMM0 = _mm_loadu_pd((x)+i );
+ __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+ __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+ __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+ __m128d XMM4 = _mm_loadu_pd((y)+i );
+ __m128d XMM5 = _mm_loadu_pd((y)+i+2);
+ __m128d XMM6 = _mm_loadu_pd((y)+i+4);
+ __m128d XMM7 = _mm_loadu_pd((y)+i+6);
+ XMM0 = _mm_sub_pd(XMM0, XMM4);
+ XMM1 = _mm_sub_pd(XMM1, XMM5);
+ XMM2 = _mm_sub_pd(XMM2, XMM6);
+ XMM3 = _mm_sub_pd(XMM3, XMM7);
+ _mm_storeu_pd((z)+i , XMM0);
+ _mm_storeu_pd((z)+i+2, XMM1);
+ _mm_storeu_pd((z)+i+4, XMM2);
+ _mm_storeu_pd((z)+i+6, XMM3);
+ }
+ long off = (n) - ((n)%8);
+ for (i=0; i<((n)%8); i++) {
+ z[off+i] = x[off+i] - y[off+i];
+ }
+}
+
+
+static void THDoubleVector_scale_SSE(double *y, const double c, const long n) {
+ long i;
+ __m128d XMM7 = _mm_set1_pd(c);
+ for (i=0; i<=((n)-4); i+=4) {
+ __m128d XMM0 = _mm_loadu_pd((y)+i );
+ __m128d XMM1 = _mm_loadu_pd((y)+i+2);
+ XMM0 = _mm_mul_pd(XMM0, XMM7);
+ XMM1 = _mm_mul_pd(XMM1, XMM7);
+ _mm_storeu_pd((y)+i , XMM0);
+ _mm_storeu_pd((y)+i+2, XMM1);
+ }
+ long off = (n) - ((n)%4);
+ for (i=0; i<((n)%4); i++) {
+ y[off+i] *= c;
+ }
+}
+
+
+static void THDoubleVector_mul_SSE(double *y, const double *x, const long n) {
+ long i;
+ for (i=0; i<=((n)-8); i+=8) {
+ __m128d XMM0 = _mm_loadu_pd((x)+i );
+ __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+ __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+ __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+ __m128d XMM4 = _mm_loadu_pd((y)+i );
+ __m128d XMM5 = _mm_loadu_pd((y)+i+2);
+ __m128d XMM6 = _mm_loadu_pd((y)+i+4);
+ __m128d XMM7 = _mm_loadu_pd((y)+i+6);
+ XMM4 = _mm_mul_pd(XMM4, XMM0);
+ XMM5 = _mm_mul_pd(XMM5, XMM1);
+ XMM6 = _mm_mul_pd(XMM6, XMM2);
+ XMM7 = _mm_mul_pd(XMM7, XMM3);
+ _mm_storeu_pd((y)+i , XMM4);
+ _mm_storeu_pd((y)+i+2, XMM5);
+ _mm_storeu_pd((y)+i+4, XMM6);
+ _mm_storeu_pd((y)+i+6, XMM7);
+ }
+ long off = (n) - ((n)%8);
+ for (i=0; i<((n)%8); i++) {
+ y[off+i] *= x[off+i];
+ }
+}
+
+
+static void THFloatVector_fill_SSE(float *x, const float c, const long n) {
+ long i;
+ __m128 XMM0 = _mm_set_ps1(c);
+ long off;
+ for (i=0; i<=((n)-16); i+=16) {
+ _mm_storeu_ps((x)+i , XMM0);
+ _mm_storeu_ps((x)+i+4, XMM0);
+ _mm_storeu_ps((x)+i+8, XMM0);
+ _mm_storeu_ps((x)+i+12, XMM0);
+ }
+ off = (n) - ((n)%16);
+ for (i=0; i<((n)%16); i++) {
+ x[off+i] = c;
+ }
+}
+
+
+static void THFloatVector_add_SSE(float *y, const float *x, const float c, const long n) {
+ long i = 0;
+ __m128 XMM7 = _mm_set_ps1(c);
+ __m128 XMM0,XMM2;
+ for (; i<=((n)-4); i+=4) {
+ XMM0 = _mm_loadu_ps((x)+i);
+ XMM2 = _mm_loadu_ps((y)+i);
+ XMM0 = _mm_mul_ps(XMM0, XMM7);
+ XMM2 = _mm_add_ps(XMM2, XMM0);
+ _mm_storeu_ps((y)+i , XMM2);
+ }
+ for (; i<(n); i++) {
+ y[i] += c * x[i];
+ }
+}
+
+
+static void THFloatVector_diff_SSE(float *z, const float *x, const float *y, const long n) {
+ long i;
+ for (i=0; i<=((n)-16); i+=16) {
+ __m128 XMM0 = _mm_loadu_ps((x)+i );
+ __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
+ __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
+ __m128 XMM3 = _mm_loadu_ps((x)+i+12);
+ __m128 XMM4 = _mm_loadu_ps((y)+i );
+ __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
+ __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
+ __m128 XMM7 = _mm_loadu_ps((y)+i+12);
+ XMM0 = _mm_sub_ps(XMM0, XMM4);
+ XMM1 = _mm_sub_ps(XMM1, XMM5);
+ XMM2 = _mm_sub_ps(XMM2, XMM6);
+ XMM3 = _mm_sub_ps(XMM3, XMM7);
+ _mm_storeu_ps((z)+i , XMM0);
+ _mm_storeu_ps((z)+i+ 4, XMM1);
+ _mm_storeu_ps((z)+i+ 8, XMM2);
+ _mm_storeu_ps((z)+i+12, XMM3);
+ }
+ long off = (n) - ((n)%16);
+ for (i=0; i<((n)%16); i++) {
+ z[off+i] = x[off+i] - y[off+i];
+ }
+}
+
+
+static void THFloatVector_scale_SSE(float *y, const float c, const long n) {
+ long i;
+ __m128 XMM7 = _mm_set_ps1(c);
+ for (i=0; i<=((n)-8); i+=8) {
+ __m128 XMM0 = _mm_loadu_ps((y)+i );
+ __m128 XMM1 = _mm_loadu_ps((y)+i+4);
+ XMM0 = _mm_mul_ps(XMM0, XMM7);
+ XMM1 = _mm_mul_ps(XMM1, XMM7);
+ _mm_storeu_ps((y)+i , XMM0);
+ _mm_storeu_ps((y)+i+4, XMM1);
+ }
+ long off = (n) - ((n)%8);
+ for (i=0; i<((n)%8); i++) {
+ y[off+i] *= c;
+ }
+}
+
+
+static void THFloatVector_mul_SSE(float *y, const float *x, const long n) {
+ long i;
+ for (i=0; i<=((n)-16); i+=16) {
+ __m128 XMM0 = _mm_loadu_ps((x)+i );
+ __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
+ __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
+ __m128 XMM3 = _mm_loadu_ps((x)+i+12);
+ __m128 XMM4 = _mm_loadu_ps((y)+i );
+ __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
+ __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
+ __m128 XMM7 = _mm_loadu_ps((y)+i+12);
+ XMM4 = _mm_mul_ps(XMM4, XMM0);
+ XMM5 = _mm_mul_ps(XMM5, XMM1);
+ XMM6 = _mm_mul_ps(XMM6, XMM2);
+ XMM7 = _mm_mul_ps(XMM7, XMM3);
+ _mm_storeu_ps((y)+i , XMM4);
+ _mm_storeu_ps((y)+i+ 4, XMM5);
+ _mm_storeu_ps((y)+i+ 8, XMM6);
+ _mm_storeu_ps((y)+i+12, XMM7);
+ }
+ long off = (n) - ((n)%16);
+ for (i=0; i<((n)%16); i++) {
+ y[off+i] *= x[off+i];
+ }
+}
diff --git a/lib/luaT/CMakeLists.txt b/lib/luaT/CMakeLists.txt
index b221a17..f33768c 100644
--- a/lib/luaT/CMakeLists.txt
+++ b/lib/luaT/CMakeLists.txt
@@ -13,6 +13,10 @@ if(BUILD_STATIC)
ADD_LIBRARY(luaT_static STATIC luaT.h luaT.c)
endif()
+SET_TARGET_PROPERTIES(luaT PROPERTIES
+ VERSION 0
+ SOVERSION 0)
+
IF(APPLE)
SET_TARGET_PROPERTIES(luaT PROPERTIES
LINK_FLAGS "-undefined dynamic_lookup")
diff --git a/lib/luaT/README.md b/lib/luaT/README.md
index 431e37f..f28d143 100644
--- a/lib/luaT/README.md
+++ b/lib/luaT/README.md
@@ -237,7 +237,7 @@ shall not be freed. It is a pointer inside `tname` string.
<a name="luat_classmodulename"/>
### int luaT_classmodulename(const char *tname, char *parent_name) ###
-Alias to `luaT_fullparentname ` for ensuring backwards compatibilty;
+Alias to `luaT_fullparentname ` for ensuring backwards compatibility;
use of `luaT_fullparentname` is preferred.
<a name="luat_fullparentname"/>
diff --git a/test/test.lua b/test/test.lua
index 20ca035..21df3b6 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -183,7 +183,7 @@ function torchtest.rsqrt()
end
function torchtest.sigmoid()
- -- cant use genericSingleOpTest, since `math.sigmoid` doesnt exist, have to use
+ -- can't use genericSingleOpTest, since `math.sigmoid` doesn't exist, have to use
-- `torch.sigmoid` instead
local inputValues = {-1000,-1,0,0.5,1,2,1000}
local expectedOutput = {0.0000, 0.2689, 0.5, 0.6225, 0.7311, 0.8808, 1.000}
@@ -2921,7 +2921,12 @@ function torchtest.abs()
end
-- Checking that the right abs function is called for LongTensor
- local bignumber = 2^31 + 1
+ local bignumber
+ if torch.LongTensor():elementSize() > 4 then
+ bignumber = 2^31 + 1
+ else
+ bignumber = 2^15 + 1
+ end
local input = torch.LongTensor{-bignumber}
mytester:assertgt(input:abs()[1], 0, 'torch.abs(3)')
end
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git
More information about the debian-science-commits
mailing list