[lua-torch-torch7] 01/03: New upstream version 0~20161115-g552b086
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Mon Nov 21 03:11:33 UTC 2016
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.
commit 584c9181a042f45b703b88b5c0bf05fbe546f386
Author: Zhou Mo <cdluminate at gmail.com>
Date: Mon Nov 21 03:10:14 2016 +0000
New upstream version 0~20161115-g552b086
---
 .travis.yml                       |   5 -
 doc/maths.md                      |  41 +++---
 doc/random.md                     |   3 +
 doc/tensor.md                     |   2 +-
 lib/TH/CMakeLists.txt             |   7 +-
 lib/TH/THAllocator.c              |   8 +-
 lib/TH/THDiskFile.c               |   2 -
 lib/TH/THGeneral.c                |  29 ++++
 lib/TH/THGeneral.h.in             |   3 +
 lib/TH/THRandom.c                 |   4 +-
 lib/TH/cmake/FindARM.cmake        |   9 ++
 lib/TH/generic/THTensorLapack.c   |  93 +++++++++---
 lib/TH/generic/THVector.h         |   2 +-
 lib/TH/generic/THVectorDispatch.c |   2 +-
 lib/TH/vector/NEON.c              | 296 ++++++++------------------------------
 test/test.lua                     |  11 ++
 utils.c                           |  21 +--
 17 files changed, 228 insertions(+), 310 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index c28b4d1..a71c5e2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,11 +12,6 @@ env:
- TORCH_LUA_VERSION=LUA52
os:
- linux
-matrix:
- include:
- - os: osx
- env: TORCH_LUA_VERSION=LUAJIT21
- compiler: clang
addons:
apt:
packages:
diff --git a/doc/maths.md b/doc/maths.md
index dd427ea..252b52d 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -362,6 +362,13 @@ For more than 4 dimensions, you can use a storage: `y = torch.zeros(torch.LongSt
`x:atan()` replaces all elements in-place with the arctangent of the elements of `x`.
+<a name="torch.atan2"></a>
+### [res] torch.atan2([res,] x, y) ###
+<a name="torch.atan2"></a>
+
+`z = torch.atan2(x, y)` returns a new `Tensor` with the arctangent of the corresponding elements of `x` and `y`; element-wise, this is the quadrant-aware `atan(x / y)`.
+
+`x:atan2(y)` replaces all elements of `x` in-place with the arctangent of the corresponding elements of `x` and `y`.
<a name="torch.ceil"></a>
### [res] torch.ceil([res,] x) ###
@@ -427,20 +434,17 @@ For more than 4 dimensions, you can use a storage: `y = torch.zeros(torch.LongSt
This function is more accurate than [`log`](#torch.log) for small values of `x`.
-<a name="x:neg"></a>
+<a name="torch.neg"></a>
### x:neg() ###
-<a name="x:neg"></a>
`x:neg()` replaces all elements in-place with the sign-reversed values of the elements of `x`.
-
-<a name="x:cinv"></a>
+<a name="torch.cinv"></a>
### x:cinv() ###
-<a name="x:cinv"></a>
+<a name="torch.cinv"></a>
`x:cinv()` replaces all elements in-place with `1.0 / x`.
-
<a name="torch.pow"></a>
### [res] torch.pow([res,] x, n) ###
<a name="torch.pow"></a>
@@ -632,18 +636,17 @@ The number of elements must match, but sizes do not matter.
`torch.add(z, x, value, y)` puts the result of `x + value * y` in `z`.
-<a name="x:csub"></a>
+<a name="torch.csub"></a>
### tensor:csub(value) ###
-<a name="x:csub"></a>
+<a name="torch.csub"></a>
Subtracts the given value from all elements in the `Tensor`, in place.
+<a name="torch.csub"></a>
+### tensor:csub(tensor2) ###
+<a name="torch.csub"></a>
-<a name="x:csub"></a>
-### tensor1:csub(tensor2) ###
-<a name="x:csub"></a>
-
-Subtracts `tensor2` from `tensor1`, in place.
+Subtracts `tensor2` from `tensor`, in place.
The number of elements must match, but sizes do not matter.
```lua
@@ -2704,36 +2707,38 @@ They return a `ByteTensor` in which each element is `0` or `1` indicating if the
Implements `<` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.le"></a>
### torch.le(a, b) ###
Implements `<=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.gt"></a>
### torch.gt(a, b) ###
Implements `>` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.ge"></a>
### torch.ge(a, b) ###
Implements `>=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.eq"></a>
### torch.eq(a, b) ###
Implements `==` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
-<a name="torch.lt"></a>
+<a name="torch.ne"></a>
### torch.ne(a, b) ###
Implements `~=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+<a name="torch.all"></a>
+<a name="torch.any"></a>
### torch.all(a) ###
### torch.any(a) ###
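For orientation, a short Lua sketch of the `atan2` and comparison APIs these doc fixes touch (a usage sketch, assuming stock torch7 semantics):

```lua
x = torch.Tensor({1, 1})
y = torch.Tensor({1, -1})
print(torch.atan2(x, y))          -- element-wise, quadrant-aware atan(x / y)

a = torch.Tensor({1, 2, 3})
b = torch.Tensor({3, 2, 1})
print(torch.le(a, b))             -- ByteTensor: 1 1 0
print(torch.any(torch.eq(a, b)))  -- true: at least one element matches
```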
diff --git a/doc/random.md b/doc/random.md
index e6fa6ab..2bb2d1f 100644
--- a/doc/random.md
+++ b/doc/random.md
@@ -158,6 +158,9 @@ Returns a random real number according to the Cauchy distribution
Returns a random real number according to the log-normal distribution, with
the given `mean` and standard deviation `stdv`.
+`mean` and `stdv` are the mean and standard deviation of the underlying normal distribution,
+not of the returned log-normal distribution.
+
`stdv` must be positive.
<a name="torch.geometric"></a>
diff --git a/doc/tensor.md b/doc/tensor.md
index fabaaa7..5809dc1 100644
--- a/doc/tensor.md
+++ b/doc/tensor.md
@@ -1592,7 +1592,7 @@ The source `tensor` should have at least as many elements as the number of 1s in
x = torch.Tensor({0, 0, 0, 0})
mask = torch.ByteTensor({0, 1, 0, 1})
y = torch.Tensor({10, 20})
-x:maskedCopy(y)
+x:maskedCopy(mask,y)
print(x)
0
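The corrected snippet in full, for reference (output as documented upstream):

```lua
x = torch.Tensor({0, 0, 0, 0})
mask = torch.ByteTensor({0, 1, 0, 1})
y = torch.Tensor({10, 20})
x:maskedCopy(mask, y)  -- copies y's elements into x where mask is 1
print(x)               -- 0, 10, 0, 20
```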
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 29343c7..e6cf91d 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -62,10 +62,13 @@ ENDIF (WITH_OPENMP)
# ARM specific flags
FIND_PACKAGE(ARM)
-IF (NEON_FOUND)
+IF (ASIMD_FOUND)
+ MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__")
+ SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}")
+ELSEIF (NEON_FOUND)
MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
-ENDIF (NEON_FOUND)
+ENDIF (ASIMD_FOUND)
IF (CORTEXA8_FOUND)
MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")
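The new branch keys off an `ASIMD_FOUND` flag set by the FindARM.cmake hunk further down, which greps /proc/cpuinfo for the `asimd` feature flag. A stand-alone check of the same condition, as a Lua sketch (Linux-only):

```lua
-- Linux-only sketch mirroring FindARM.cmake's grep for the 'asimd' flag
local f = io.open('/proc/cpuinfo', 'r')
if f then
  local cpuinfo = f:read('*a')
  f:close()
  print(cpuinfo:find('asimd') and 'ASIMD (ARMv8 NEON) available'
                              or 'no asimd flag')
end
```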
diff --git a/lib/TH/THAllocator.c b/lib/TH/THAllocator.c
index 5b06502..4f4f04f 100644
--- a/lib/TH/THAllocator.c
+++ b/lib/TH/THAllocator.c
@@ -250,12 +250,8 @@ static void *_map_alloc(void* ctx_, ptrdiff_t size)
{
if(ctx->flags)
{
- /* if it is shared mem, let's put it in correct size */
- if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
- {
- if(ftruncate(fd, size) == -1)
- THError("unable to resize shared memory file <%s> to the right size", ctx->filename);
- }
+ if(ftruncate(fd, size) == -1)
+ THError("unable to resize file <%s> to the right size", ctx->filename);
if(fstat(fd, &file_stat) == -1 || file_stat.st_size < size)
{
close(fd);
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index 2ded7bd..9d9cbae 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -2,9 +2,7 @@
#include "THDiskFile.h"
#include "THFilePrivate.h"
-#ifdef _WIN64
#include <stdint.h>
-#endif
typedef struct THDiskFile__
{
diff --git a/lib/TH/THGeneral.c b/lib/TH/THGeneral.c
index 399403b..cb9c79e 100644
--- a/lib/TH/THGeneral.c
+++ b/lib/TH/THGeneral.c
@@ -1,6 +1,10 @@
#include "THGeneral.h"
#include "THAtomic.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
#ifndef TH_HAVE_THREAD
#define __thread
#elif _MSC_VER
@@ -314,3 +318,28 @@ double THLog1p(const double x)
return log1p(x);
#endif
}
+
+void THSetNumThreads(int num_threads)
+{
+#ifdef _OPENMP
+ omp_set_num_threads(num_threads);
+#endif
+}
+
+int THGetNumThreads(void)
+{
+#ifdef _OPENMP
+ return omp_get_max_threads();
+#else
+ return 1;
+#endif
+}
+
+int THGetNumCores(void)
+{
+#ifdef _OPENMP
+ return omp_get_num_procs();
+#else
+ return 1;
+#endif
+}
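These helpers centralize the OpenMP calls that utils.c previously inlined (see the utils.c hunk at the end of this diff). Observable behavior at the Lua level is unchanged; a usage sketch:

```lua
print(torch.getnumthreads())  -- omp_get_max_threads(), or 1 without OpenMP
torch.setnumthreads(4)        -- routes through THSetNumThreads
print(torch.getnumcores())    -- omp_get_num_procs(), or 1 without OpenMP
```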
diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
index ff41159..bc7e448 100644
--- a/lib/TH/THGeneral.h.in
+++ b/lib/TH/THGeneral.h.in
@@ -64,6 +64,9 @@ TH_API void THFree(void *ptr);
TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data );
// this hook should only be called by custom allocator functions
TH_API void THHeapUpdate(ptrdiff_t size);
+TH_API void THSetNumThreads(int num_threads);
+TH_API int THGetNumThreads(void);
+TH_API int THGetNumCores(void);
#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__)
diff --git a/lib/TH/THRandom.c b/lib/TH/THRandom.c
index 55ee943..fbaf282 100644
--- a/lib/TH/THRandom.c
+++ b/lib/TH/THRandom.c
@@ -255,10 +255,8 @@ double THRandom_cauchy(THGenerator *_generator, double median, double sigma)
M'enfin. */
double THRandom_logNormal(THGenerator *_generator, double mean, double stdv)
{
- double zm = mean*mean;
- double zs = stdv*stdv;
THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive");
- return(exp(THRandom_normal(_generator, log(zm/sqrt(zs + zm)), sqrt(log(zs/zm+1)) )));
+ return(exp(THRandom_normal(_generator, mean, stdv)));
}
int THRandom_geometric(THGenerator *_generator, double p)
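For context, a reading of the removed lines: the old code treated `mean` and `stdv` as the moments of the returned log-normal distribution and inverted them to get the parameters of the underlying normal,

$$\mu = \log\frac{m^2}{\sqrt{s^2 + m^2}}, \qquad \sigma = \sqrt{\log\Big(\frac{s^2}{m^2} + 1\Big)},$$

while the new code passes `mean` and `stdv` straight through as $\mu$ and $\sigma$, matching the clarified wording in doc/random.md above.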
diff --git a/lib/TH/cmake/FindARM.cmake b/lib/TH/cmake/FindARM.cmake
index cf1f8fd..59c78d8 100644
--- a/lib/TH/cmake/FindARM.cmake
+++ b/lib/TH/cmake/FindARM.cmake
@@ -13,6 +13,15 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
set(NEON_FOUND false CACHE BOOL "NEON available on host")
ENDIF (NEON_TRUE)
+  # on ARMv8, NEON is inherent and is instead listed as 'asimd' in /proc/cpuinfo
+ STRING(REGEX REPLACE "^.*(asimd).*$" "\\1" ASIMD_THERE ${CPUINFO})
+ STRING(COMPARE EQUAL "asimd" "${ASIMD_THERE}" ASIMD_TRUE)
+ IF (ASIMD_TRUE)
+ set(ASIMD_FOUND true CACHE BOOL "ASIMD/NEON available on host")
+ ELSE (ASIMD_TRUE)
+ set(ASIMD_FOUND false CACHE BOOL "ASIMD/NEON available on host")
+ ENDIF (ASIMD_TRUE)
+
#Find the processor type (for now OMAP3 or OMAP4)
STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO})
STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE)
diff --git a/lib/TH/generic/THTensorLapack.c b/lib/TH/generic/THTensorLapack.c
index 62d730a..fb1e246 100644
--- a/lib/TH/generic/THTensorLapack.c
+++ b/lib/TH/generic/THTensorLapack.c
@@ -103,12 +103,23 @@ static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src)
void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
{
+ int free_b = 0;
if (a == NULL) a = ra_;
if (b == NULL) b = rb_;
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
- THArgCheck(a->size[0] == b->size[0], 2, "A,b size incompatible");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
+ a->size[0], a->size[1]);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int n, nrhs, lda, ldb, info;
THIntTensor *ipiv;
@@ -132,23 +143,36 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
THCleanup(
THTensor_(free)(ra__);
THTensor_(free)(rb__);
- THIntTensor_free(ipiv);),
+ THIntTensor_free(ipiv);
+ if (free_b) THTensor_(free)(b);),
"gesv", info, info);
THTensor_(freeCopyTo)(ra__, ra_);
THTensor_(freeCopyTo)(rb__, rb_);
THIntTensor_free(ipiv);
+ if (free_b) THTensor_(free)(b);
}
void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
const char *uplo, const char *trans, const char *diag)
{
+ int free_b = 0;
if (a == NULL) a = ra_;
if (b == NULL) b = rb_;
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(b->nDimension == 2, 1, "A should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
- THArgCheck(b->size[0] == a->size[0], 2, "A,b size incompatible");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
+ a->size[0], a->size[1]);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int n, nrhs, lda, ldb, info;
THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
@@ -168,21 +192,35 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
- THCleanup(THTensor_(free)(ra__); THTensor_(free)(rb__);),
+ THCleanup(
+ THTensor_(free)(ra__);
+ THTensor_(free)(rb__);
+ if (free_b) THTensor_(free)(b);),
"trtrs", info, info);
THTensor_(freeCopyTo)(ra__, ra_);
THTensor_(freeCopyTo)(rb__, rb_);
+ if (free_b) THTensor_(free)(b);
}
void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
{
+ int free_b = 0;
// Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_.
if (a == NULL) a = ra_;
if (b == NULL) b = rb_;
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
- THArgCheck(a->size[0] == b->size[0], 2, "size incompatible A,b");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int m, n, nrhs, lda, ldb, info, lwork;
THTensor *work = NULL;
@@ -217,7 +255,8 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero",
THCleanup(THTensor_(free)(ra__);
THTensor_(free)(rb__);
- THTensor_(free)(work);),
+ THTensor_(free)(work);
+ if (free_b) THTensor_(free)(b);),
"gels", info,"");
/* rb__ is currently ldb by nrhs; resize it to n by nrhs */
@@ -228,6 +267,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
THTensor_(freeCopyTo)(ra__, ra_);
THTensor_(freeCopyTo)(rb__, rb_);
THTensor_(free)(work);
+ if (free_b) THTensor_(free)(b);
}
void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr)
@@ -312,6 +352,7 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz
{
if (a == NULL) a = rv_;
THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+ THArgCheck(a->size[0] == a->size[1], 1,"A should be square");
int n, lda, lwork, info;
THTensor *work;
@@ -562,9 +603,9 @@ void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo)
/* Run Factorization */
THLapack_(potrf)(uplo[0], n, THTensor_(data)(ra__), lda, &info);
- THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized",
+ THLapackCheckWithCleanup("Lapack Error in %s : the leading minor of order %d is not positive definite",
THCleanup(THTensor_(free)(ra__);),
- "potrf", info, info);
+ "potrf", info);
THTensor_(clearUpLoTriangle)(ra__, uplo);
THTensor_(freeCopyTo)(ra__, ra_);
@@ -572,9 +613,23 @@ void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo)
void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo)
{
+ int free_b = 0;
if (b == NULL) b = rb_;
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+ THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
+ a->nDimension);
+ THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
+ "dimensions, but has %d", b->nDimension);
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
+ a->size[0], a->size[1]);
+ THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
+ "rows, B has %ld", a->size[0], b->size[0]);
+
+ if (b->nDimension == 1) {
+ b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
+ b->stride[0], 1, 0);
+ free_b = 1;
+ }
int n, nrhs, lda, ldb, info;
THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
@@ -595,9 +650,11 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo)
THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
THCleanup(
THTensor_(free)(ra__);
- THTensor_(free)(rb__);),
+ THTensor_(free)(rb__);
+ if (free_b) THTensor_(free)(b);),
"potrs", info, info);
+ if (free_b) THTensor_(free)(b);
THTensor_(free)(ra__);
THTensor_(freeCopyTo)(rb__, rb_);
}
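The common thread in these hunks: `gesv`, `trtrs`, `gels`, and `potrs` now accept a 1-D right-hand side by viewing it as an n-by-1 matrix (the temporary view is released via `free_b`), and the argument checks now report the actual sizes. From Lua, a vector RHS works directly (a sketch, assuming the stock wrappers forward the tensor unchanged):

```lua
a = torch.rand(3, 3)
b = torch.rand(3)     -- 1-D right-hand side; previously rejected with
                      -- "B should be 2 dimensional"
x = torch.gesv(b, a)  -- solves a * x = b
print(x)
```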
diff --git a/lib/TH/generic/THVector.h b/lib/TH/generic/THVector.h
index 5326b16..67fdcfa 100644
--- a/lib/TH/generic/THVector.h
+++ b/lib/TH/generic/THVector.h
@@ -9,6 +9,6 @@ TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n);
TH_API void THVector_(mul)(real *y, const real *x, const ptrdiff_t n);
/* Initialize the dispatch pointers */
-TH_API void THVector_(vectorDispatchInit)();
+TH_API void THVector_(vectorDispatchInit)(void);
#endif
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
index eae5073..6fd1d68 100644
--- a/lib/TH/generic/THVectorDispatch.c
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -127,7 +127,7 @@ void THVector_(mul)(real *y, const real *x, const ptrdiff_t n) {
* This means that in the dispatch tables, implementations supporting more recent extensions
* need to come first
*/
-void THVector_(vectorDispatchInit)()
+void THVector_(vectorDispatchInit)(void)
{
uint32_t hostSimdExts = detectHostSIMDExtensions();
INIT_DISPATCH_PTR(fill);
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c
index bc7cb2b..327b006 100644
--- a/lib/TH/vector/NEON.c
+++ b/lib/TH/vector/NEON.c
@@ -1,252 +1,78 @@
static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "ldr r4, [%1] @ \n\t"
- "vdup.32 q12, r4 @ \n\t"
- "vdup.32 q13, r4 @ \n\t"
- "lsrs r4, %2, #3 @ \n\t"
- "beq 3f @ \n\t"
- "1: @ \n\t"
- "vst1.32 {d24-d27}, [r0]! @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "bne 1b @ \n\t"
- "3: @ \n\t"
- "ands r4, %2, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vst1.32 {d24[0]}, [r0]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (x), "r"(caddr),"r"(n)
- : "cc", "r0", "r4", "memory",
- "q12",
- "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ x[i] = c;
+ x[i+1] = c;
+ x[i+2] = c;
+ x[i+3] = c;
+ }
+
+ for(; i < n; i++)
+ x[i] = c;
+
}
static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
- __asm__ __volatile__ (
- "mov r0, %2 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, %0 @ \n\t"
- "lsrs r4, %3, #3 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vsub.f32 q12, q8, q0 @ \n\t"
- "vsub.f32 q13, q9, q1 @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "3: @ \n\t"
- "ands r4, %3, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vsub.f32 d24, d16, d0 @ \n\t"
- "vst1.32 {d24[0]}, [r2]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (z), "r" (x),"r" (y), "r"(n)
- : "cc", "r0", "r1", "r2", "r4", "memory",
- "q0", "q1", "q8", "q9", "q12", "q13",
- "d0", "d1", "d2", "d3",
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ z[i] = x[i] - y[i];
+ z[i+1] = x[i+1] - y[i+1];
+ z[i+2] = x[i+2] - y[i+2];
+ z[i+3] = x[i+3] - y[i+3];
+ }
+
+ for(; i < n; i++)
+ z[i] = x[i] - y[i];
+
}
static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r2, r0 @ \n\t"
- "ldr r5, [%1] @ \n\t"
- "vdup.32 q14, r5 @ \n\t"
- "lsrs r5, %2, #5 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vld1.32 {d8-d11}, [r0]! @ \n\t"
- "vld1.32 {d12-d15}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vmul.f32 q2, q2, q14 @ \n\t"
- "vmul.f32 q3, q3, q14 @ \n\t"
- "vmul.f32 q4, q4, q14 @ \n\t"
- "vmul.f32 q5, q5, q14 @ \n\t"
- "vmul.f32 q6, q6, q14 @ \n\t"
- "vmul.f32 q7, q7, q14 @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vst1.32 {d8-d11}, [r2]! @ \n\t"
- "vld1.32 {d8-d11}, [r0]! @ \n\t"
- "vst1.32 {d12-d15}, [r2]! @ \n\t"
- "vld1.32 {d12-d15}, [r0]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vst1.32 {d8-d11}, [r2]! @ \n\t"
- "vst1.32 {d12-d15}, [r2]! @ \n\t"
- "3: @ \n\t"
- "lsrs r5, %2, #4 @ \n\t"
- "ands r5, r5, #1 @ \n\t"
- "beq 4f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vmul.f32 q2, q2, q14 @ \n\t"
- "vmul.f32 q3, q3, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "4: @ \n\t"
- "lsrs r5, %2, #3 @ \n\t"
- "ands r5, r5, #1 @ \n\t"
- "beq 5f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "5: @ \n\t"
- "ands r5, %2, #7 @ \n\t"
- "beq 7f @ \n\t"
- "6: @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "vld1.32 d0[0], [r0]! @ \n\t"
- "vmul.f32 d0, d0, d28 @ \n\t"
- "vst1.32 d0[0], [r2]! @ \n\t"
- "bne 6b @ \n\t"
- "7: @ "
- :
- :"r" (y), "r"(caddr),"r"(n)
- : "cc", "r0", "r2", "r5", "memory",
- "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
- "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
- "d28", "d29"
- );
+ long i = 0;
+ for(; i < n-4; i +=4)
+ {
+ y[i] *= c;
+ y[i+1] *= c;
+ y[i+2] *= c;
+ y[i+3] *= c;
+ }
+
+ for(; i < n; i++)
+ y[i] *= c;
}
static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, r0 @ \n\t"
- "lsrs r4, %2, #3 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmul.f32 q12, q8, q0 @ \n\t"
- "vmul.f32 q13, q9, q1 @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "3: @ \n\t"
- "ands r4, %2, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vmul.f32 q12, q8, q0 @ \n\t"
- "vst1.32 {d24[0]}, [r2]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (y),"r" (x),"r"(n)
- : "cc", "r0", "r1", "r2", "r4", "memory",
- "q0", "q1", "q8", "q9", "q12", "q13",
- "d0", "d1", "d2", "d3",
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ y[i] *= x[i];
+ y[i+1] *= x[i+1];
+ y[i+2] *= x[i+2];
+ y[i+3] *= x[i+3];
+ }
+
+ for(; i < n; i++)
+ y[i] *= x[i];
}
static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, r0 @ \n\t"
- "ldr r5, [%2] @ \n\t"
- "vdup.32 q14, r5 @ \n\t"
- "lsrs r5, %3, #4 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d20-d23}, [r1]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmla.f32 q0, q8, q14 @ \n\t"
- "vmla.f32 q1, q9, q14 @ \n\t"
- "vmla.f32 q2, q10, q14 @ \n\t"
- "vmla.f32 q3, q11, q14 @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d20-d23}, [r1]! @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "3: @ \n\t"
- "lsrs r5, %3, #3 @ \n\t"
- "ands r5, #1 @ \n\t"
- "beq 4f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vmla.f32 q0, q8, q14 @ \n\t"
- "vmla.f32 q1, q9, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "4: @ \n\t"
- "ands r5, %3, #7 @ \n\t"
- "beq 6f @ \n\t"
- "5: @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vmla.f32 d0, d16, d28 @ \n\t"
- "vst1.32 d0[0], [r2]! @ \n\t"
- "bne 5b @ \n\t"
- "6: @ "
- :
- :"r" (y),"r" (x), "r"(caddr),"r"(n)
- : "cc", "r0", "r1", "r2", "r5", "memory",
- "q0", "q1", "q2", "q3", "q14",
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
- "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29"
- );
+ long i = 0;
+
+ for(;i < n-4; i += 4)
+ {
+ y[i] += c * x[i];
+ y[i+1] += c * x[i+1];
+ y[i+2] += c * x[i+2];
+ y[i+3] += c * x[i+3];
+ }
+
+ for(; i < n; i++)
+ y[i] += c * x[i];
}
diff --git a/test/test.lua b/test/test.lua
index 4290036..3eb119f 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -3435,6 +3435,17 @@ function torchtest.bernoulli()
mytester:assert(isBinary(t), 'Sample from torch.bernoulli is not binary')
end
+function torchtest.logNormal()
+ local t = torch.FloatTensor(10, 10)
+ local mean, std = torch.uniform(), 0.1 * torch.uniform()
+ local tolerance = 0.02
+
+ t:logNormal(mean, std)
+ local logt = t:log()
+ mytester:assertalmosteq(logt:mean(), mean, tolerance, 'mean is wrong')
+ mytester:assertalmosteq(logt:std(), std, tolerance, 'std is wrong')
+end
+
function torch.test(tests)
torch.setheaptracking(true)
math.randomseed(os.time())
diff --git a/utils.c b/utils.c
index eb7ff53..894bb6e 100644
--- a/utils.c
+++ b/utils.c
@@ -7,10 +7,6 @@
# include <sys/time.h>
#endif
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
THLongStorage* torch_checklongargs(lua_State *L, int index)
{
THLongStorage *storage;
@@ -171,30 +167,19 @@ const char* torch_getdefaulttensortype(lua_State *L)
static int torch_getnumthreads(lua_State *L)
{
-#ifdef _OPENMP
- lua_pushinteger(L, omp_get_max_threads());
-#else
- lua_pushinteger(L, 1);
-#endif
+ lua_pushinteger(L, THGetNumThreads());
return 1;
}
static int torch_setnumthreads(lua_State *L)
{
-#ifdef _OPENMP
- int nth = luaL_checkint(L,1);
- omp_set_num_threads(nth);
-#endif
+ THSetNumThreads(luaL_checkint(L, 1));
return 0;
}
static int torch_getnumcores(lua_State *L)
{
-#ifdef _OPENMP
- lua_pushinteger(L, omp_get_num_procs());
-#else
- lua_pushinteger(L, 1);
-#endif
+ lua_pushinteger(L, THGetNumCores());
return 1;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git