[lua-torch-torch7] 01/03: New upstream version 0~20161115-g552b086
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Mon Nov 21 03:11:33 UTC 2016
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.
commit 584c9181a042f45b703b88b5c0bf05fbe546f386
Author: Zhou Mo <cdluminate at gmail.com>
Date: Mon Nov 21 03:10:14 2016 +0000
New upstream version 0~20161115-g552b086
---
 .travis.yml                       |   5 -
 doc/maths.md                      |  41 +++---
 doc/random.md                     |   3 +
 doc/tensor.md                     |   2 +-
 lib/TH/CMakeLists.txt             |   7 +-
 lib/TH/THAllocator.c              |   8 +-
 lib/TH/THDiskFile.c               |   2 -
 lib/TH/THGeneral.c                |  29 ++++
 lib/TH/THGeneral.h.in             |   3 +
 lib/TH/THRandom.c                 |   4 +-
 lib/TH/cmake/FindARM.cmake        |   9 ++
 lib/TH/generic/THTensorLapack.c   |  93 +++++++++---
 lib/TH/generic/THVector.h         |   2 +-
 lib/TH/generic/THVectorDispatch.c |   2 +-
 lib/TH/vector/NEON.c              | 296 ++++++++------------------------------
 test/test.lua                     |  11 ++
 utils.c                           |  21 +--
 17 files changed, 228 insertions(+), 310 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index c28b4d1..a71c5e2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,11 +12,6 @@ env:
- TORCH_LUA_VERSION=LUA52
os:
- linux
-matrix:
- include:
- - os: osx
- env: TORCH_LUA_VERSION=LUAJIT21
- compiler: clang
addons:
apt:
packages:
diff --git a/doc/maths.md b/doc/maths.md
index dd427ea..252b52d 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -362,6 +362,13 @@ For more than 4 dimensions, you can use a storage: `y = torch.zeros(torch.LongSt
`x:atan()` replaces all elements in-place with the arctangent of the elements of `x`.
+<a name="torch.atan2"></a>
+### [res] torch.atan2([res,] x, y) ###
+<a name="torch.atan2"></a>
+
+`z = torch.atan2(x, y)` returns a new `Tensor` with the arctangent of the corresponding elements of `x` and `y`; element-wise, this is the quadrant-aware `atan(x / y)`.
+
+`x:atan2(y)` replaces all elements of `x` in-place with the arctangent of the corresponding elements of `x` and `y`.
<a name="torch.ceil"></a>
### [res] torch.ceil([res,] x) ###
@@ -427,20 +434,17 @@ For more than 4 dimensions, you can use a storage: `y = torch.zeros(torch.LongSt
This function is more accurate than [`log`](#torch.log) for small values of `x`.
-<a name="x:neg"></a>
+<a name="torch.neg"></a>
### x:neg() ###
-<a name="x:neg"></a>
`x:neg()` replaces all elements in-place with the sign-reversed values of the elements of `x`.
-
-<a name="x:cinv"></a>
+<a name="torch.cinv"></a>
### x:cinv() ###
-<a name="x:cinv"></a>
+<a name="torch.cinv"></a>
`x:cinv()` replaces all elements in-place with `1.0 / x`.
-
<a name="torch.pow"></a>
### [res] torch.pow([res,] x, n) ###
<a name="torch.pow"></a>
@@ -632,18 +636,17 @@ The number of elements must match, but sizes do not matter.
`torch.add(z, x, value, y)` puts the result of `x + value * y` in `z`.
-<a name="x:csub"></a>
+<a name="torch.csub"></a>
### tensor:csub(value) ###
-<a name="x:csub"></a>
+<a name="torch.csub"></a>
Subtracts the given value from all elements in the `Tensor`, in place.
+<a name="torch.csub"></a>
+### tensor:csub(tensor2) ###
+<a name="torch.csub"></a>
-<a name="x:csub"></a>
-### tensor1:csub(tensor2) ###
-<a name="x:csub"></a>
-
-Subtracts `tensor2` from `tensor1`, in place.
+Subtracts `tensor2` from `tensor`, in place.
The number of elements must match, but sizes do not matter.
```lua
@@ -2704,36 +2707,38 @@ They return a `ByteTensor` in which each element is `0` or `1` indicating if the
Implements `<` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.le"></a>
### torch.le(a, b) ###
Implements `<=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.gt"></a>
### torch.gt(a, b) ###
Implements `>` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.ge"></a>
### torch.ge(a, b) ###
Implements `>=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.eq"></a>
### torch.eq(a, b) ###
Implements `==` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.ne"></a>
### torch.ne(a, b) ###
Implements `~=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+<a name="torch.all"></a>
+<a name="torch.any"></a>
### torch.all(a) ###
### torch.any(a) ###
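For orientation, a short Lua sketch of the `atan2` and comparison APIs these doc fixes touch (a usage sketch, assuming stock torch7 semantics):

```lua
x = torch.Tensor({1, 1})
y = torch.Tensor({1, -1})
print(torch.atan2(x, y))          -- element-wise, quadrant-aware atan(x / y)

a = torch.Tensor({1, 2, 3})
b = torch.Tensor({3, 2, 1})
print(torch.le(a, b))             -- ByteTensor: 1 1 0
print(torch.any(torch.eq(a, b)))  -- true: at least one element matches
```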
diff --git a/doc/random.md b/doc/random.md
index e6fa6ab..2bb2d1f 100644
--- a/doc/random.md
+++ b/doc/random.md
@@ -158,6 +158,9 @@ Returns a random real number according to the Cauchy distribution
Returns a random real number according to the log-normal distribution, with
the given `mean` and standard deviation `stdv`.
+`mean` and `stdv` are the mean and standard deviation of the underlying normal distribution,
+not of the returned log-normal distribution.
+
`stdv` must be positive.
<a name="torch.geometric"></a>
diff --git a/doc/tensor.md b/doc/tensor.md
index fabaaa7..5809dc1 100644
--- a/doc/tensor.md
+++ b/doc/tensor.md
@@ -1592,7 +1592,7 @@ The source `tensor` should have at least as many elements as the number of 1s in
x = torch.Tensor({0, 0, 0, 0})
mask = torch.ByteTensor({0, 1, 0, 1})
y = torch.Tensor({10, 20})
-x:maskedCopy(y)
+x:maskedCopy(mask,y)
print(x)
0
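The corrected snippet in full, for reference (output as documented upstream):

```lua
x = torch.Tensor({0, 0, 0, 0})
mask = torch.ByteTensor({0, 1, 0, 1})
y = torch.Tensor({10, 20})
x:maskedCopy(mask, y)  -- copies y's elements into x where mask is 1
print(x)               -- 0, 10, 0, 20
```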
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 29343c7..e6cf91d 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -62,10 +62,13 @@ ENDIF (WITH_OPENMP)
# ARM specific flags
FIND_PACKAGE(ARM)
-IF (NEON_FOUND)
+IF (ASIMD_FOUND)
+ MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__")
+ SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}")
+ELSEIF (NEON_FOUND)
MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
-ENDIF (NEON_FOUND)
+ENDIF (ASIMD_FOUND)
IF (CORTEXA8_FOUND)
MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")
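The new branch keys off an `ASIMD_FOUND` flag set by the FindARM.cmake hunk further down, which greps /proc/cpuinfo for the `asimd` feature flag. A stand-alone check of the same condition, as a Lua sketch (Linux-only):

```lua
-- Linux-only sketch mirroring FindARM.cmake's grep for the 'asimd' flag
local f = io.open('/proc/cpuinfo', 'r')
if f then
  local cpuinfo = f:read('*a')
  f:close()
  print(cpuinfo:find('asimd') and 'ASIMD (ARMv8 NEON) available'
                              or 'no asimd flag')
end
```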
diff --git a/lib/TH/THAllocator.c b/lib/TH/THAllocator.c
index 5b06502..4f4f04f 100644
--- a/lib/TH/THAllocator.c
+++ b/lib/TH/THAllocator.c
@@ -250,12 +250,8 @@ static void *_map_alloc(void* ctx_, ptrdiff_t size)
{
if(ctx->flags)
{
- /* if it is shared mem, let's put it in correct size */
- if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
- {
- if(ftruncate(fd, size) == -1)
- THError("unable to resize shared memory file <%s> to the right size", ctx->filename);
- }
+ if(ftruncate(fd, size) == -1)
+ THError("unable to resize file <%s> to the right size", ctx->filename);
if(fstat(fd, &file_stat) == -1 || file_stat.st_size < size)
{
close(fd);
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index 2ded7bd..9d9cbae 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -2,9 +2,7 @@
#include "THDiskFile.h"
#include "THFilePrivate.h"
-#ifdef _WIN64
#include <stdint.h>
-#endif
typedef struct THDiskFile__
{
diff --git a/lib/TH/THGeneral.c b/lib/TH/THGeneral.c
index 399403b..cb9c79e 100644
--- a/lib/TH/THGeneral.c
+++ b/lib/TH/THGeneral.c
@@ -1,6 +1,10 @@
#include "THGeneral.h"
#include "THAtomic.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
#ifndef TH_HAVE_THREAD
#define __thread
#elif _MSC_VER
@@ -314,3 +318,28 @@ double THLog1p(const double x)
return log1p(x);
#endif
}
+
+void THSetNumThreads(int num_threads)
+{
+#ifdef _OPENMP
+ omp_set_num_threads(num_threads);
+#endif
+}
+
+int THGetNumThreads(void)
+{
+#ifdef _OPENMP
+ return omp_get_max_threads();
+#else
+ return 1;
+#endif
+}
+
+int THGetNumCores(void)
+{
+#ifdef _OPENMP
+ return omp_get_num_procs();
+#else
+ return 1;
+#endif
+}
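These helpers centralize the OpenMP calls that utils.c previously inlined (see the utils.c hunk at the end of this diff). Observable behavior at the Lua level is unchanged; a usage sketch:

```lua
print(torch.getnumthreads())  -- omp_get_max_threads(), or 1 without OpenMP
torch.setnumthreads(4)        -- routes through THSetNumThreads
print(torch.getnumcores())    -- omp_get_num_procs(), or 1 without OpenMP
```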
diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
index ff41159..bc7e448 100644
--- a/lib/TH/THGeneral.h.in
+++ b/lib/TH/THGeneral.h.in
@@ -64,6 +64,9 @@ TH_API void THFree(void *ptr);
TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data );
// this hook should only be called by custom allocator functions
TH_API void THHeapUpdate(ptrdiff_t size);
+TH_API void THSetNumThreads(int num_threads);
+TH_API int THGetNumThreads(void);
+TH_API int THGetNumCores(void);
#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__)
diff --git a/lib/TH/THRandom.c b/lib/TH/THRandom.c
index 55ee943..fbaf282 100644
--- a/lib/TH/THRandom.c
+++ b/lib/TH/THRandom.c
@@ -255,10 +255,8 @@ double THRandom_cauchy(THGenerator *_generator, double median, double sigma)
M'enfin. */
double THRandom_logNormal(THGenerator *_generator, double mean, double stdv)
{
- double zm = mean*mean;
- double zs = stdv*stdv;
THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive");
- return(exp(THRandom_normal(_generator, log(zm/sqrt(zs + zm)), sqrt(log(zs/zm+1)) )));
+ return(exp(THRandom_normal(_generator, mean, stdv)));
}
int THRandom_geometric(THGenerator *_generator, double p)
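For context, a reading of the removed lines: the old code treated `mean` and `stdv` as the moments of the returned log-normal distribution and inverted them to get the parameters of the underlying normal,

$$\mu = \log\frac{m^2}{\sqrt{s^2 + m^2}}, \qquad \sigma = \sqrt{\log\Big(\frac{s^2}{m^2} + 1\Big)},$$

while the new code passes `mean` and `stdv` straight through as $\mu$ and $\sigma$, matching the clarified wording in doc/random.md above.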
diff --git a/lib/TH/cmake/FindARM.cmake b/lib/TH/cmake/FindARM.cmake
index cf1f8fd..59c78d8 100644
--- a/lib/TH/cmake/FindARM.cmake
+++ b/lib/TH/cmake/FindARM.cmake
@@ -13,6 +13,15 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
set(NEON_FOUND false CACHE BOOL "NEON available on host")
ENDIF (NEON_TRUE)
+  # on ARMv8, NEON is inherent and is instead listed as 'asimd' in /proc/cpuinfo
+ STRING(REGEX REPLACE "^.*(asimd).*$" "\\1" ASIMD_THERE ${CPUINFO})
+ STRING(COMPARE EQUAL "asimd" "${ASIMD_THERE}" ASIMD_TRUE)
+ IF (ASIMD_TRUE)
+ set(ASIMD_FOUND true CACHE BOOL "ASIMD/NEON available on host")
+ ELSE (ASIMD_TRUE)
+ set(ASIMD_FOUND false CACHE BOOL "ASIMD/NEON available on host")
+ ENDIF (ASIMD_TRUE)
+
#Find the processor type (for now OMAP3 or OMAP4)
STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO})
STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE)
diff --git a/lib/TH/generic/THTensorLapack.c b/lib/TH/generic/THTensorLapack.c
index 62d730a..fb1e246 100644
--- a/lib/TH/generic/THTensorLapack.c
+++ b/lib/TH/generic/THTensorLapack.c
@@ -103,12 +103,23 @@ static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src)
void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
{
+ int free_b = 0;
if (a == NULL) a = ra_;
if (b == NULL) b = rb_;
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
- THArgCheck(a->size[0] == b->size[0], 2, "A,b size incompatible");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
+ a->size[0], a->size[1]);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int n, nrhs, lda, ldb, info;
THIntTensor *ipiv;
@@ -132,23 +143,36 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
THCleanup(
THTensor_(free)(ra__);
THTensor_(free)(rb__);
- THIntTensor_free(ipiv);),
+ THIntTensor_free(ipiv);
+ if (free_b) THTensor_(free)(b);),
"gesv", info, info);
THTensor_(freeCopyTo)(ra__, ra_);
THTensor_(freeCopyTo)(rb__, rb_);
THIntTensor_free(ipiv);
+ if (free_b) THTensor_(free)(b);
}
void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
const char *uplo, const char *trans, const char *diag)
{
+ int free_b = 0;
if (a == NULL) a = ra_;
if (b == NULL) b = rb_;
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(b->nDimension == 2, 1, "A should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
- THArgCheck(b->size[0] == a->size[0], 2, "A,b size incompatible");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
+ a->size[0], a->size[1]);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int n, nrhs, lda, ldb, info;
THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
@@ -168,21 +192,35 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
- THCleanup(THTensor_(free)(ra__); THTensor_(free)(rb__);),
+ THCleanup(
+ THTensor_(free)(ra__);
+ THTensor_(free)(rb__);
+ if (free_b) THTensor_(free)(b);),
"trtrs", info, info);
THTensor_(freeCopyTo)(ra__, ra_);
THTensor_(freeCopyTo)(rb__, rb_);
+ if (free_b) THTensor_(free)(b);
}
void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
{
+ int free_b = 0;
// Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_.
if (a == NULL) a = ra_;
if (b == NULL) b = rb_;
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
- THArgCheck(a->size[0] == b->size[0], 2, "size incompatible A,b");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int m, n, nrhs, lda, ldb, info, lwork;
THTensor *work = NULL;
@@ -217,7 +255,8 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero",
THCleanup(THTensor_(free)(ra__);
THTensor_(free)(rb__);
- THTensor_(free)(work);),
+ THTensor_(free)(work);
+ if (free_b) THTensor_(free)(b);),
"gels", info,"");
/* rb__ is currently ldb by nrhs; resize it to n by nrhs */
@@ -228,6 +267,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
THTensor_(freeCopyTo)(ra__, ra_);
THTensor_(freeCopyTo)(rb__, rb_);
THTensor_(free)(work);
+ if (free_b) THTensor_(free)(b);
}
void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr)
@@ -312,6 +352,7 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz
{
if (a == NULL) a = rv_;
THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+ THArgCheck(a->size[0] == a->size[1], 1,"A should be square");
int n, lda, lwork, info;
THTensor *work;
@@ -562,9 +603,9 @@ void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo)
/* Run Factorization */
THLapack_(potrf)(uplo[0], n, THTensor_(data)(ra__), lda, &info);
- THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized",
+ THLapackCheckWithCleanup("Lapack Error in %s : the leading minor of order %d is not positive definite",
THCleanup(THTensor_(free)(ra__);),
- "potrf", info, info);
+ "potrf", info);
THTensor_(clearUpLoTriangle)(ra__, uplo);
THTensor_(freeCopyTo)(ra__, ra_);
@@ -572,9 +613,23 @@ void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo)
void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo)
{
+ int free_b = 0;
if (b == NULL) b = rb_;
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
+ a->size[0], a->size[1]);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int n, nrhs, lda, ldb, info;
THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
@@ -595,9 +650,11 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo)
THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
THCleanup(
THTensor_(free)(ra__);
- THTensor_(free)(rb__);),
+ THTensor_(free)(rb__);
+ if (free_b) THTensor_(free)(b);),
"potrs", info, info);
+ if (free_b) THTensor_(free)(b);
THTensor_(free)(ra__);
THTensor_(freeCopyTo)(rb__, rb_);
}
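The common thread in these hunks: `gesv`, `trtrs`, `gels`, and `potrs` now accept a 1-D right-hand side by viewing it as an n-by-1 matrix (the temporary view is released via `free_b`), and the argument checks now report the actual sizes. From Lua, a vector RHS works directly (a sketch, assuming the stock wrappers forward the tensor unchanged):

```lua
a = torch.rand(3, 3)
b = torch.rand(3)     -- 1-D right-hand side; previously rejected with
                      -- "B should be 2 dimensional"
x = torch.gesv(b, a)  -- solves a * x = b
print(x)
```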
diff --git a/lib/TH/generic/THVector.h b/lib/TH/generic/THVector.h
index 5326b16..67fdcfa 100644
--- a/lib/TH/generic/THVector.h
+++ b/lib/TH/generic/THVector.h
@@ -9,6 +9,6 @@ TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n);
TH_API void THVector_(mul)(real *y, const real *x, const ptrdiff_t n);
/* Initialize the dispatch pointers */
-TH_API void THVector_(vectorDispatchInit)();
+TH_API void THVector_(vectorDispatchInit)(void);
#endif
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
index eae5073..6fd1d68 100644
--- a/lib/TH/generic/THVectorDispatch.c
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -127,7 +127,7 @@ void THVector_(mul)(real *y, const real *x, const ptrdiff_t n) {
* This means that in the dispatch tables, implementations supporting more recent extensions
* need to come first
*/
-void THVector_(vectorDispatchInit)()
+void THVector_(vectorDispatchInit)(void)
{
uint32_t hostSimdExts = detectHostSIMDExtensions();
INIT_DISPATCH_PTR(fill);
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c
index bc7cb2b..327b006 100644
--- a/lib/TH/vector/NEON.c
+++ b/lib/TH/vector/NEON.c
@@ -1,252 +1,78 @@
static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "ldr r4, [%1] @ \n\t"
- "vdup.32 q12, r4 @ \n\t"
- "vdup.32 q13, r4 @ \n\t"
- "lsrs r4, %2, #3 @ \n\t"
- "beq 3f @ \n\t"
- "1: @ \n\t"
- "vst1.32 {d24-d27}, [r0]! @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "bne 1b @ \n\t"
- "3: @ \n\t"
- "ands r4, %2, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vst1.32 {d24[0]}, [r0]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (x), "r"(caddr),"r"(n)
- : "cc", "r0", "r4", "memory",
- "q12",
- "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ x[i] = c;
+ x[i+1] = c;
+ x[i+2] = c;
+ x[i+3] = c;
+ }
+
+ for(; i < n; i++)
+ x[i] = c;
+
}
static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
- __asm__ __volatile__ (
- "mov r0, %2 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, %0 @ \n\t"
- "lsrs r4, %3, #3 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vsub.f32 q12, q8, q0 @ \n\t"
- "vsub.f32 q13, q9, q1 @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "3: @ \n\t"
- "ands r4, %3, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vsub.f32 d24, d16, d0 @ \n\t"
- "vst1.32 {d24[0]}, [r2]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (z), "r" (x),"r" (y), "r"(n)
- : "cc", "r0", "r1", "r2", "r4", "memory",
- "q0", "q1", "q8", "q9", "q12", "q13",
- "d0", "d1", "d2", "d3",
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ z[i] = x[i] - y[i];
+ z[i+1] = x[i+1] - y[i+1];
+ z[i+2] = x[i+2] - y[i+2];
+ z[i+3] = x[i+3] - y[i+3];
+ }
+
+ for(; i < n; i++)
+ z[i] = x[i] - y[i];
+
}
static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r2, r0 @ \n\t"
- "ldr r5, [%1] @ \n\t"
- "vdup.32 q14, r5 @ \n\t"
- "lsrs r5, %2, #5 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vld1.32 {d8-d11}, [r0]! @ \n\t"
- "vld1.32 {d12-d15}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vmul.f32 q2, q2, q14 @ \n\t"
- "vmul.f32 q3, q3, q14 @ \n\t"
- "vmul.f32 q4, q4, q14 @ \n\t"
- "vmul.f32 q5, q5, q14 @ \n\t"
- "vmul.f32 q6, q6, q14 @ \n\t"
- "vmul.f32 q7, q7, q14 @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vst1.32 {d8-d11}, [r2]! @ \n\t"
- "vld1.32 {d8-d11}, [r0]! @ \n\t"
- "vst1.32 {d12-d15}, [r2]! @ \n\t"
- "vld1.32 {d12-d15}, [r0]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vst1.32 {d8-d11}, [r2]! @ \n\t"
- "vst1.32 {d12-d15}, [r2]! @ \n\t"
- "3: @ \n\t"
- "lsrs r5, %2, #4 @ \n\t"
- "ands r5, r5, #1 @ \n\t"
- "beq 4f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vmul.f32 q2, q2, q14 @ \n\t"
- "vmul.f32 q3, q3, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "4: @ \n\t"
- "lsrs r5, %2, #3 @ \n\t"
- "ands r5, r5, #1 @ \n\t"
- "beq 5f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "5: @ \n\t"
- "ands r5, %2, #7 @ \n\t"
- "beq 7f @ \n\t"
- "6: @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "vld1.32 d0[0], [r0]! @ \n\t"
- "vmul.f32 d0, d0, d28 @ \n\t"
- "vst1.32 d0[0], [r2]! @ \n\t"
- "bne 6b @ \n\t"
- "7: @ "
- :
- :"r" (y), "r"(caddr),"r"(n)
- : "cc", "r0", "r2", "r5", "memory",
- "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
- "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
- "d28", "d29"
- );
+ long i = 0;
+ for(; i < n-4; i +=4)
+ {
+ y[i] *= c;
+ y[i+1] *= c;
+ y[i+2] *= c;
+ y[i+3] *= c;
+ }
+
+ for(; i < n; i++)
+ y[i] *= c;
}
static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, r0 @ \n\t"
- "lsrs r4, %2, #3 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmul.f32 q12, q8, q0 @ \n\t"
- "vmul.f32 q13, q9, q1 @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "3: @ \n\t"
- "ands r4, %2, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vmul.f32 q12, q8, q0 @ \n\t"
- "vst1.32 {d24[0]}, [r2]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (y),"r" (x),"r"(n)
- : "cc", "r0", "r1", "r2", "r4", "memory",
- "q0", "q1", "q8", "q9", "q12", "q13",
- "d0", "d1", "d2", "d3",
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ y[i] *= x[i];
+ y[i+1] *= x[i+1];
+ y[i+2] *= x[i+2];
+ y[i+3] *= x[i+3];
+ }
+
+ for(; i < n; i++)
+ y[i] *= x[i];
}
static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, r0 @ \n\t"
- "ldr r5, [%2] @ \n\t"
- "vdup.32 q14, r5 @ \n\t"
- "lsrs r5, %3, #4 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d20-d23}, [r1]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmla.f32 q0, q8, q14 @ \n\t"
- "vmla.f32 q1, q9, q14 @ \n\t"
- "vmla.f32 q2, q10, q14 @ \n\t"
- "vmla.f32 q3, q11, q14 @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d20-d23}, [r1]! @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "3: @ \n\t"
- "lsrs r5, %3, #3 @ \n\t"
- "ands r5, #1 @ \n\t"
- "beq 4f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vmla.f32 q0, q8, q14 @ \n\t"
- "vmla.f32 q1, q9, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "4: @ \n\t"
- "ands r5, %3, #7 @ \n\t"
- "beq 6f @ \n\t"
- "5: @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vmla.f32 d0, d16, d28 @ \n\t"
- "vst1.32 d0[0], [r2]! @ \n\t"
- "bne 5b @ \n\t"
- "6: @ "
- :
- :"r" (y),"r" (x), "r"(caddr),"r"(n)
- : "cc", "r0", "r1", "r2", "r5", "memory",
- "q0", "q1", "q2", "q3", "q14",
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
- "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29"
- );
+ long i = 0;
+
+ for(;i < n-4; i += 4)
+ {
+ y[i] += c * x[i];
+ y[i+1] += c * x[i+1];
+ y[i+2] += c * x[i+2];
+ y[i+3] += c * x[i+3];
+ }
+
+ for(; i < n; i++)
+ y[i] += c * x[i];
}
diff --git a/test/test.lua b/test/test.lua
index 4290036..3eb119f 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -3435,6 +3435,17 @@ function torchtest.bernoulli()
mytester:assert(isBinary(t), 'Sample from torch.bernoulli is not binary')
end
+function torchtest.logNormal()
+ local t = torch.FloatTensor(10, 10)
+ local mean, std = torch.uniform(), 0.1 * torch.uniform()
+ local tolerance = 0.02
+
+ t:logNormal(mean, std)
+ local logt = t:log()
+ mytester:assertalmosteq(logt:mean(), mean, tolerance, 'mean is wrong')
+ mytester:assertalmosteq(logt:std(), std, tolerance, 'std is wrong')
+end
+
function torch.test(tests)
torch.setheaptracking(true)
math.randomseed(os.time())
diff --git a/utils.c b/utils.c
index eb7ff53..894bb6e 100644
--- a/utils.c
+++ b/utils.c
@@ -7,10 +7,6 @@
# include <sys/time.h>
#endif
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
THLongStorage* torch_checklongargs(lua_State *L, int index)
{
THLongStorage *storage;
@@ -171,30 +167,19 @@ const char* torch_getdefaulttensortype(lua_State *L)
static int torch_getnumthreads(lua_State *L)
{
-#ifdef _OPENMP
- lua_pushinteger(L, omp_get_max_threads());
-#else
- lua_pushinteger(L, 1);
-#endif
+ lua_pushinteger(L, THGetNumThreads());
return 1;
}
static int torch_setnumthreads(lua_State *L)
{
-#ifdef _OPENMP
- int nth = luaL_checkint(L,1);
- omp_set_num_threads(nth);
-#endif
+ THSetNumThreads(luaL_checkint(L, 1));
return 0;
}
static int torch_getnumcores(lua_State *L)
{
-#ifdef _OPENMP
- lua_pushinteger(L, omp_get_num_procs());
-#else
- lua_pushinteger(L, 1);
-#endif
+ lua_pushinteger(L, THGetNumCores());
return 1;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git