[lua-torch-torch7] 01/07: New upstream version 0~20170720-gaed3171

Zhou Mo cdluminate-guest at moszumanska.debian.org
Sun Jul 30 06:17:24 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.

commit 7fd65a4bd1fc8c3ee5c9b7edf1c5d41ae2748fc0
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Fri Jul 28 09:48:14 2017 +0000

    New upstream version 0~20170720-gaed3171
---
 TensorMath.lua                  |  36 +++++++---
 cmake/TorchPackage.cmake        |  33 +++++----
 doc/file.md                     |  14 ++--
 doc/maths.md                    |  61 ++++++++++++++++-
 doc/pipefile.md                 |   4 +-
 doc/random.md                   |   4 +-
 doc/tensor.md                   |   2 +-
 doc/tester.md                   |  42 ++++++------
 doc/timer.md                    |   6 +-
 generic/Tensor.c                |   8 +--
 generic/TensorOperator.c        |  16 +++--
 init.lua                        |  16 ++++-
 lib/TH/CMakeLists.txt           |  51 +++++++++++---
 lib/TH/TH.h                     |   1 +
 lib/TH/THAllocator.c            |   7 ++
 lib/TH/THDiskFile.c             |   3 +
 lib/TH/THGeneral.c              |  38 +++++++++++
 lib/TH/THGeneral.h.in           |   8 +++
 lib/TH/THSize.c                 |  26 +++++++
 lib/TH/THSize.h                 |  13 ++++
 lib/TH/THStorage.c              | 137 +++++++++++++++++++++++++------------
 lib/TH/THStorage.h              |  16 +++--
 lib/TH/THTensorApply.h          |  32 +++++++--
 lib/TH/THTensorDimApply.h       |  52 ++++++++++----
 lib/TH/cmake/FindBLAS.cmake     |  16 +++++
 lib/TH/cmake/FindMKL.cmake      |  27 ++++++--
 lib/TH/generic/THBlas.c         |  11 +++
 lib/TH/generic/THTensor.c       |  65 ++++++++++++++++--
 lib/TH/generic/THTensor.h       |   3 +
 lib/TH/generic/THTensorCopy.c   |  71 ++++++++++++++++++-
 lib/TH/generic/THTensorLapack.c |   5 +-
 lib/TH/generic/THTensorLapack.h |   2 +-
 lib/TH/generic/THTensorMath.c   | 146 +++++++++++++++++++++++++++++++++++++---
 lib/TH/generic/THTensorMath.h   |  11 +--
 lib/TH/generic/THTensorRandom.c | 110 ++++++++++++++++++++++++++++++
 lib/TH/generic/THTensorRandom.h |   2 +
 lib/luaT/CMakeLists.txt         |  11 ++-
 test/test.lua                   |  23 +++++++
 test/test_aliasMultinomial.lua  |  40 +++++++++++
 39 files changed, 985 insertions(+), 184 deletions(-)

diff --git a/TensorMath.lua b/TensorMath.lua
index 45e07c6..1e224c8 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -1096,7 +1096,9 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
          wrap(name,
               cname(name .. "all"),
               {{name=Tensor},
-               {name=accreal, creturned=true}},
+               {name="boolean", default=false},
+               {name=accreal, creturned=true}
+              },
               cname(name),
               {{name=Tensor, default=true, returned=true},
                {name=Tensor},
@@ -1269,16 +1271,30 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
       wrap("multinomial",
            cname("multinomial"),
            {{name="IndexTensor", default=true, returned=true, method={default='nil'}},
-            {name='Generator', default=true},
-            {name=Tensor},
-            {name="int"},
-            {name="boolean", default=false}})
-
+              {name='Generator', default=true},
+              {name=Tensor},
+              {name="int"},
+              {name="boolean", default=false}})
+      
+      wrap("multinomialAliasSetup_",
+           cname("multinomialAliasSetup"),
+           {{name=Tensor},
+              {name="IndexTensor", default=true, returned=true, method={default='nil'}},
+              {name=Tensor, default=true, returned=true, method={default='nil'}}})
+      
+      wrap("multinomialAlias_",
+           cname("multinomialAliasDraw"),
+           {{name="IndexTensor", default=true, returned=true, method={default='nil'}},
+              {name='Generator', default=true},
+              {name="IndexTensor"},
+              {name=Tensor}
+              })
+      
       for _,f in ipairs({{name='uniform', a=0, b=1},
-                         {name='normal', a=0, b=1},
-                         {name='cauchy', a=0, b=1},
-                         {name='logNormal', a=1, b=2}}) do
-
+            {name='normal', a=0, b=1},
+            {name='cauchy', a=0, b=1},
+            {name='logNormal', a=1, b=2}}) do
+         
          wrap(f.name,
               string.format("THRandom_%s", f.name),
               {{name='Generator', default=true},
diff --git a/cmake/TorchPackage.cmake b/cmake/TorchPackage.cmake
index 7fcbdff..f966dac 100644
--- a/cmake/TorchPackage.cmake
+++ b/cmake/TorchPackage.cmake
@@ -1,5 +1,21 @@
 # -*- cmake -*-
 
+MACRO(ADD_TORCH_LIBRARY package type src)
+  IF ("${type}" STREQUAL "STATIC")
+    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
+      CUDA_ADD_LIBRARY(${package} STATIC ${src})
+    else()
+      ADD_LIBRARY(${package} STATIC ${src})
+    endif()
+  ELSE()
+    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
+      CUDA_ADD_LIBRARY(${package} ${type} ${src})
+    else()
+      ADD_LIBRARY(${package} ${type} ${src})
+    endif()
+  ENDIF()
+ENDMACRO()
+
 MACRO(ADD_TORCH_PACKAGE package src luasrc)
   INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
   INCLUDE_DIRECTORIES(${Torch_LUA_INCLUDE_DIR})
@@ -8,17 +24,7 @@ MACRO(ADD_TORCH_PACKAGE package src luasrc)
 # As per the CMake docs, macro arguments are not variables, so the simple test syntax does not work
   IF(NOT "${src}" STREQUAL "")
 
-    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
-      CUDA_ADD_LIBRARY(${package} MODULE ${src})
-      if(BUILD_STATIC)
-        CUDA_ADD_LIBRARY(${package}_static STATIC ${src})
-      endif()
-    else()
-      ADD_LIBRARY(${package} MODULE ${src})
-      if(BUILD_STATIC)
-        ADD_LIBRARY(${package}_static STATIC ${src})
-      endif()
-    endif()
+    ADD_TORCH_LIBRARY(${package} MODULE "${src}")
 
    ### Torch packages suppose the library prefix is "lib"
     SET_TARGET_PROPERTIES(${package} PROPERTIES
@@ -31,12 +37,13 @@ MACRO(ADD_TORCH_PACKAGE package src luasrc)
         LINK_FLAGS "-undefined dynamic_lookup")
     ENDIF()
 
-    if(BUILD_STATIC)
+    IF (BUILD_STATIC OR "$ENV{STATIC_TH}" STREQUAL "YES")
+      ADD_TORCH_LIBRARY(${package}_static STATIC "${src}")
       SET_TARGET_PROPERTIES(${package}_static PROPERTIES
         COMPILE_FLAGS "-fPIC")
       SET_TARGET_PROPERTIES(${package}_static PROPERTIES
         PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "${package}")
-    endif()
+    ENDIF()
 
     INSTALL(TARGETS ${package}
       RUNTIME DESTINATION ${Torch_INSTALL_LUA_CPATH_SUBDIR}
diff --git a/doc/file.md b/doc/file.md
index c4aa742..accaa35 100644
--- a/doc/file.md
+++ b/doc/file.md
@@ -127,8 +127,8 @@ Serializable objects are `Torch` objects having a `read()` and
 
 If the object to save contains several other objects (let's say it is a tree
 of objects), then objects appearing several times in this tree will be
-_saved only once_. This saves disk space, speedup loading/saving and
-respect the dependencies between objects.
+_saved only once_. This saves disk space, speeds up loading/saving and
+respects the dependencies between objects.
 
 Interestingly, if the `File` is a [MemoryFile](memoryfile.md), it allows
 the user to easily make a _clone_ of any serializable object:
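For instance, a minimal sketch of that idiom, assuming a serializable `obj` (the manual's own example may differ):

```lua
-- Serialize obj into an in-memory file, rewind, and read back a deep copy.
local file = torch.MemoryFile()
file:writeObject(obj)
file:seek(1)                      -- file positions are 1-based in torch
local clone = file:readObject()
file:close()
```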
@@ -188,7 +188,7 @@ If the object has been already written in the file, only a _reference_ to
 this already saved object will be written: this saves space and speeds up
 writing; it also keeps the dependencies between objects intact.
 
-In returns, if one writes an object, modify its member, and write the
+In return, if one writes an object, modifies its member, and writes the
 object again in the same file, the modifications will not be recorded
 in the file, as only a reference to the original will be written. See
 [readObject()](#torch.File.readObject) for an example.
@@ -196,14 +196,14 @@ in the file, as only a reference to the original will be written. See
 <a name="torch.File.readString"></a>
 ### [string] readString(format) ###
 
-If `format` starts with ''"*l"` then returns the next line in the `File''. The end-of-line character is skipped.
+If `format` starts with `"*l"` then returns the next line in the `File`. The end-of-line character is skipped.
 
-If `format` starts with ''"*a"` then returns all the remaining contents of the `File''.
+If `format` starts with `"*a"` then returns all the remaining contents of the `File`.
 
 If no data is available, an error is raised, unless `File` is in [quiet()](#torch.File.quiet) mode, in which case
 it returns an empty string `''`; you can then check that the last read failed due to end of file with your_file:[hasError()](#torch.File.hasError).
 
-Because Torch is more precise on number typing, the `Lua` format ''"*n"'' is not supported:
+Because Torch is more precise on number typing, the `Lua` format `"*n"` is not supported:
 instead use one of the [number read methods](#torch.File.read).
 
 <a name="torch.File.writeString"></a>
@@ -361,4 +361,4 @@ behaviour.
 <a name="torch.File.isReferenced"></a>
 ### isReferenced() ###
 
-Return the state set by [referenced](#torch.File.referenced).
+Returns the state set by [referenced](#torch.File.referenced).
diff --git a/doc/maths.md b/doc/maths.md
index eb9f5cf..9cf29bc 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -170,7 +170,7 @@ By default the elements are sorted into 100 equally spaced bins between the mini
 `y = torch.bhistc(x, n, min, max)` same as above with `n` bins and `[min, max]` as elements range.
 
 ```lua
-x =torch.Tensor(3, 6)
+x = torch.Tensor(3, 6)
 
 > x[1] = torch.Tensor{ 2, 4, 2, 2, 5, 4 }
 > x[2] = torch.Tensor{ 3, 5, 1, 5, 3, 5 }
@@ -255,6 +255,62 @@ p.multinomial(res, p, n, replacement) -- p.multinomial instead of torch.multinom
 
 This is due to the fact that the result here is of a `LongTensor` type, and we do not define a `torch.multinomial` over long `Tensor`s.
 
+<a name="torch.multinomialAlias()"></a>
+### [state] torch.multinomialAliasSetup(probs [, state]) ###
+### [res] torch.multinomialAlias(output, state) ###
+`state = torch.multinomialAliasSetup(probs)` returns a table `state` consisting of two tensors: a `probability table` and an `alias table`. This setup is required once per `probs` vector; we can then sample from the multinomial distribution multiple times by consulting the `state` table.
+
+`torch.multinomialAlias(output, state)` returns `output` filled with indices drawn from the multinomial distribution `probs`. Since `output` itself is filled with the indices, it is not necessary to capture the return value of the statement.
+
+The sampling is done through a technique explained very simply in this blog post about [The Alias Method](https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/); the paper describing the technique is available [here](http://www.tandfonline.com/doi/abs/10.1080/00031305.1979.10482697). This method can only sample with replacement; a sketch of the algorithm follows the examples below.
+
+The `output` `Tensor` that is fed into the `multinomialAlias` method need not be contiguous, but it can only be a 1d tensor; if you need to fill an nd tensor, pass in a 1d view of it. This method is substantially faster than `torch.multinomial` when you want to draw many samples from the same distribution or sample from the same distribution a large number of times. `torch.multinomial` is faster for sampling few samples from a distribution once because  [...]
+
+```lua
+> state = torch.multinomialAliasSetup(probs)
+> state
+{
+  1 : LongTensor - size: 4
+  2 : DoubleTensor - size: 4
+}
+> output = torch.LongTensor(2,3)
+> torch.multinomialAlias(output:view(-1), state)
+ 4
+ 1
+ 2
+ 3
+ 2
+ 2
+[torch.LongTensor of size 6]
+> output
+ 4  1  2
+ 3  2  2
+[torch.LongTensor of size 2x3]
+```
+
+You can also allocate memory and reuse it for the state table.
+
+```lua
+> state = {torch.LongTensor(), torch.DoubleTensor()}
+> probs = torch.DoubleTensor({0.2, 0.3, 0.5})
+> state = torch.multinomialAliasSetup(probs, state)
+> state
+{
+  1 : LongTensor - size: 3
+  2 : DoubleTensor - size: 3
+}
+> output = torch.LongTensor(7)
+> torch.multinomialAlias(output, state)
+ 2
+ 2
+ 3
+ 1
+ 2
+ 2
+ 2
+[torch.LongTensor of size 7]
+```
+
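As a rough illustration of what `multinomialAliasSetup`/`multinomialAlias` compute internally, here is a minimal pure-Lua sketch of the alias method (Vose's variant). The names `aliasSetup` and `aliasDraw` are illustrative only, not part of the torch API; the real implementation lives in `lib/TH/generic/THTensorRandom.c`.

```lua
-- Setup: split the scaled probabilities into a probability table and an
-- alias table so that each "column" mixes at most two outcomes.
local function aliasSetup(probs)            -- probs: Lua table summing to 1
   local n = #probs
   local prob, alias, small, large = {}, {}, {}, {}
   for i = 1, n do
      prob[i] = probs[i] * n
      if prob[i] < 1 then small[#small + 1] = i else large[#large + 1] = i end
   end
   while #small > 0 and #large > 0 do
      local s, l = table.remove(small), table.remove(large)
      alias[s] = l                           -- overflow of column s comes from l
      prob[l] = prob[l] + prob[s] - 1        -- l donated (1 - prob[s]) mass
      if prob[l] < 1 then small[#small + 1] = l else large[#large + 1] = l end
   end
   for _, i in ipairs(small) do prob[i] = 1 end  -- numerical leftovers
   for _, i in ipairs(large) do prob[i] = 1 end
   return prob, alias
end

-- Draw: O(1) per sample; pick a column uniformly, then flip a biased coin.
local function aliasDraw(prob, alias)
   local i = math.random(#prob)
   if math.random() < prob[i] then return i else return alias[i] end
end
```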
 <a name="torch.ones"></a>
 ### [res] torch.ones([res,] m [,n...]) ###
 <a name="torch.ones"></a>
@@ -411,7 +467,8 @@ For more than 4 dimensions, you can use a storage: `y = torch.zeros(torch.LongSt
 ### [res] torch.atan2([res,] x, y) ###
 <a name="torch.atan2"></a>
 
-`y = torch.atan2(x, y)` returns a new `Tensor` with the arctangent of the elements of `x` and `y`.
+`y = torch.atan2(x, y)` returns a new `Tensor` with the arctangent of the elements of `x` and `y`. 
+Note that the arctangent of the elements `x` and `y` is the signed angle in radians between the ray from the origin through (1, 0) and the ray from the origin through (y, x).
 
 `x:atan2()` replaces all elements in-place with the arctangent of the elements of `x` and `y`.
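For instance, a quick sanity check of this convention (output abbreviated):

```lua
-- The point (y, x) = (1, 1) lies at 45 degrees, i.e. math.pi / 4:
x = torch.Tensor{1}
y = torch.Tensor{1}
print(torch.atan2(x, y))  -- 0.7854
```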
 
diff --git a/doc/pipefile.md b/doc/pipefile.md
index fdba14c..15b9cda 100644
--- a/doc/pipefile.md
+++ b/doc/pipefile.md
@@ -13,8 +13,8 @@ given to the [torch.PipeFile(fileName, mode)](#torch.PipeFile). Read-write mode
 <a name="torch.PipeFile"></a>
 ### torch.PipeFile(command, [mode], [quiet]) ###
 
-_Constructor_ which execute `command` by opening a pipe in read or write
-`mode`. Valid `mode` are `"r"` (read) or `"w"` (write). Default is read
+_Constructor_ which executes `command` by opening a pipe in read or write
+`mode`. Valid `mode`s are `"r"` (read) or `"w"` (write). Default is read
 mode.
 
 If (and only if) `quiet` is `true`, no error will be raised in case of
diff --git a/doc/random.md b/doc/random.md
index 2bb2d1f..235c6f6 100644
--- a/doc/random.md
+++ b/doc/random.md
@@ -120,10 +120,10 @@ random numbers is produced.
 
 <a name="torch.setRNGState"></a>
 ### [Tensor] setRNGState([gen,] state) ###
-Set the state of the random number generator. If `state` was obtained earlier
+Sets the state of the random number generator. If `state` was obtained earlier
 using `getRNGState` then the random number generator should now generate the
 same numbers as it did from the point where `state` was obtained. This function
-returns its argument, `state`.
+returns its argument `state`.
 
 <a name="torch.random"></a>
 ### [number] random([gen,] [a], [b]) ###
diff --git a/doc/tensor.md b/doc/tensor.md
index d18af99..75eaebc 100644
--- a/doc/tensor.md
+++ b/doc/tensor.md
@@ -2122,7 +2122,7 @@ for i=1,7 do x[i] = i end
 
 These functions apply a function to each element of the tensor on which the
 method is called (self). These methods are much faster than using a `for`
-loop in `Lua`. The results is stored in `self` (if the function returns
+loop in `Lua`. The results are stored in `self` (if the function returns
 something).
 
 <a name="torch.Tensor.apply"></a>
diff --git a/doc/tester.md b/doc/tester.md
index eab061a..da11b32 100644
--- a/doc/tester.md
+++ b/doc/tester.md
@@ -89,7 +89,7 @@ Returns a new instance of `torch.Tester` class.
 <a name="torch.Tester.add"></a>
 ### add(f, 'name') ###
 
-Add `f`, either a test function or a table of test functions, to the tester.
+Adds `f`, either a test function or a table of test functions, to the tester.
 
 If `f` is a function then names should be unique. There are a couple of special
 values for `name`: if it is `_setUp` or `_tearDown`, then the function will be
@@ -105,7 +105,7 @@ Returns the torch.Tester instance.
 <a name="torch.Tester.run"></a>
 ### run(testNames) ###
 
-Run tests that have been added by [add(f, 'name')](#torch.Tester.add).
+Runs tests that have been added by [add(f, 'name')](#torch.Tester.add).
 While running it reports progress, and at the end gives a summary of all errors.
 
 If a list of names `testNames` is passed, then all tests matching these names
@@ -120,7 +120,7 @@ tester:run({"test2", "test3"}) -- runs the tests named "test2" and "test3"
 <a name="torch.Tester.disable"></a>
 ### disable(testNames) ###
 
-Prevent the given tests from running, where `testNames` can be a single string
+Prevents the given tests from running, where `testNames` can be a single string
 or list of strings. More precisely, when [run](#torch.Tester.run)
 is invoked, it will skip these tests, while still printing out an indication of
 skipped tests. This is useful for temporarily disabling tests without
@@ -149,7 +149,7 @@ Completed 0 asserts in 1 test with 0 failures and 0 errors and 1 disabled
 <a name="torch.Tester.assert"></a>
 ### assert(condition [, message]) ###
 
-Check that `condition` is true (using the optional `message` if the test
+Checks that `condition` is true (using the optional `message` if the test
 fails).
 Returns whether the test passed.
 
@@ -159,7 +159,7 @@ Returns whether the test passed.
 General equality check between numbers, tables, strings, `torch.Tensor`
 objects, `torch.Storage` objects, etc.
 
-Check that `got` and `expected` have the same contents, where tables are
+Checks that `got` and `expected` have the same contents, where tables are
 compared recursively, tensors and storages are compared elementwise, and numbers
 are compared within `tolerance` (default value `0`). Other types are compared by
 strict equality. The optional `message` is used if the test fails.
@@ -177,7 +177,7 @@ Convenience function; does the same as
 General inequality check between numbers, tables, strings, `torch.Tensor`
 objects, `torch.Storage` objects, etc.
 
-Check that `got` and `unexpected` have different contents, where tables are
+Checks that `got` and `unexpected` have different contents, where tables are
 compared recursively, tensors and storages are compared elementwise, and numbers
 are compared within `tolerance` (default value `0`). Other types are compared by
 strict equality. The optional `message` is used if the test fails.
@@ -192,35 +192,35 @@ Convenience function; does the same as
 <a name="torch.Tester.assertlt"></a>
 ### assertlt(a, b [, message]) ###
 
-Check that `a < b` (using the optional `message` if the test fails),
+Checks that `a < b` (using the optional `message` if the test fails),
 where `a` and `b` are numbers.
 Returns whether the test passed.
 
 <a name="torch.Tester.assertgt"></a>
 ### assertgt(a, b [, message]) ###
 
-Check that `a > b` (using the optional `message` if the test fails),
+Checks that `a > b` (using the optional `message` if the test fails),
 where `a` and `b` are numbers.
 Returns whether the test passed.
 
 <a name="torch.Tester.assertle"></a>
 ### assertle(a, b [, message]) ###
 
-Check that `a <= b` (using the optional `message` if the test fails),
+Checks that `a <= b` (using the optional `message` if the test fails),
 where `a` and `b` are numbers.
 Returns whether the test passed.
 
 <a name="torch.Tester.assertge"></a>
 ### assertge(a, b [, message]) ###
 
-Check that `a >= b` (using the optional `message` if the test fails),
+Checks that `a >= b` (using the optional `message` if the test fails),
 where `a` and `b` are numbers.
 Returns whether the test passed.
 
 <a name="torch.Tester.asserteq"></a>
 ### asserteq(a, b [, message]) ###
 
-Check that `a == b` (using the optional `message` if the test fails).
+Checks that `a == b` (using the optional `message` if the test fails).
 Note that this uses the generic lua equality check, so objects such as tensors
 that have the same content but are distinct objects will fail this test;
 consider using [assertGeneralEq()](#torch.Tester.assertGeneralEq) instead.
@@ -229,7 +229,7 @@ Returns whether the test passed.
 <a name="torch.Tester.assertne"></a>
 ### assertne(a, b [, message]) ###
 
-Check that `a ~= b` (using the optional `message` if the test fails).
+Checks that `a ~= b` (using the optional `message` if the test fails).
 Note that this uses the generic lua inequality check, so objects such as tensors
 that have the same content but are distinct objects will pass this test;
 consider using [assertGeneralNe()](#torch.Tester.assertGeneralNe) instead.
@@ -238,7 +238,7 @@ Returns whether the test passed.
 <a name="torch.Tester.assertalmosteq"></a>
 ### assertalmosteq(a, b [, tolerance] [, message]) ###
 
-Check that `|a - b| <= tolerance` (using the optional `message` if the
+Checks that `|a - b| <= tolerance` (using the optional `message` if the
 test fails), where `a` and `b` are numbers, and `tolerance` is an optional
 number (default `1e-16`).
 Returns whether the test passed.
@@ -246,7 +246,7 @@ Returns whether the test passed.
 <a name="torch.Tester.assertTensorEq"></a>
 ### assertTensorEq(ta, tb [, tolerance] [, message]) ###
 
-Check that `max(abs(ta - tb)) <= tolerance` (using the optional `message`
+Checks that `max(abs(ta - tb)) <= tolerance` (using the optional `message`
 if the test fails), where `ta` and `tb` are tensors, and `tolerance` is an
 optional number (default `1e-16`). Tensors that are different types or sizes
 will cause this check to fail.
@@ -255,7 +255,7 @@ Returns whether the test passed.
 <a name="torch.Tester.assertTensorNe"></a>
 ### assertTensorNe(ta, tb [, tolerance] [, message]) ###
 
-Check that `max(abs(ta - tb)) > tolerance` (using the optional `message`
+Checks that `max(abs(ta - tb)) > tolerance` (using the optional `message`
 if the test fails), where `ta` and `tb` are tensors, and `tolerance` is an
 optional number (default `1e-16`). Tensors that are different types or sizes
 will cause this check to pass.
@@ -264,7 +264,7 @@ Returns whether the test passed.
 <a name="torch.Tester.assertTableEq"></a>
 ### assertTableEq(ta, tb [, tolerance] [, message]) ###
 
-Check that the two tables have the same contents, comparing them
+Checks that the two tables have the same contents, comparing them
 recursively, where objects such as tensors are compared using their contents.
 Numbers (such as those appearing in tensors) are considered equal if
 their difference is at most the given tolerance.
@@ -272,7 +272,7 @@ their difference is at most the given tolerance.
 <a name="torch.Tester.assertTableNe"></a>
 ### assertTableNe(ta, tb [, tolerance] [, message]) ###
 
-Check that the two tables have distinct contents, comparing them
+Checks that the two tables have distinct contents, comparing them
 recursively, where objects such as tensors are compared using their contents.
 Numbers (such as those appearing in tensors) are considered equal if
 their difference is at most the given tolerance.
@@ -280,7 +280,7 @@ their difference is at most the given tolerance.
 <a name="torch.Tester.assertError"></a>
 ### assertError(f [, message]) ###
 
-Check that calling `f()` (via `pcall`) raises an error (using the
+Checks that calling `f()` (via `pcall`) raises an error (using the
 optional `message` if the test fails).
 Returns whether the test passed.
 
@@ -294,14 +294,14 @@ Returns whether the test passed.
 <a name="torch.Tester.assertErrorMsg"></a>
 ### assertErrorMsg(f, errmsg [, message]) ###
 
-Check that calling `f()` (via `pcall`) raises an error with the specific error
+Checks that calling `f()` (via `pcall`) raises an error with the specific error
 message `errmsg` (using the optional `message` if the test fails).
 Returns whether the test passed.
 
 <a name="torch.Tester.assertErrorPattern"></a>
 ### assertErrorPattern(f, errPattern [, message]) ###
 
-Check that calling `f()` (via `pcall`) raises an error matching `errPattern`
+Checks that calling `f()` (via `pcall`) raises an error matching `errPattern`
 (using the optional `message` if the test fails).
 The matching is done using `string.find`; in particular substrings will match.
 Returns whether the test passed.
@@ -309,7 +309,7 @@ Returns whether the test passed.
 <a name="torch.Tester.assertErrorObj"></a>
 ### assertErrorObj(f, errcomp [, message]) ###
 
-Check that calling `f()` (via `pcall`) raises an error object `err` such that
+Checks that calling `f()` (via `pcall`) raises an error object `err` such that
 calling `errcomp(err)` returns true (using the optional `message` if the test
 fails).
 Returns whether the test passed.
diff --git a/doc/timer.md b/doc/timer.md
index aa6aba2..1274dd7 100644
--- a/doc/timer.md
+++ b/doc/timer.md
@@ -22,19 +22,19 @@ Returns a new `Timer`. The timer starts to count the time now.
 <a name="torch.Timer.reset"></a>
 ### [self] reset() ###
 
-Reset the timer accumulated time to `0`. If the timer was running, the timer
+Resets the timer's accumulated time to `0`. If the timer was running, the timer
 restarts counting now. If the timer was stopped, it stays stopped.
 
 <a name="torch.Timer.resume"></a>
 ### [self] resume() ###
 
-Resume a stopped timer. The timer restarts to count the time, and addition
+Resumes a stopped timer. The timer restarts counting, adding the newly
 counted time to the time accumulated before it was stopped.
 
 <a name="torch.Timer.stop"></a>
 ### [self] stop() ###
 
-Stop the timer. The accumulated time counted until now is stored.
+Stops the timer. The accumulated time counted until now is stored.
 
 <a name="torch.Timer.time"></a>
 ### [table] time() ###
diff --git a/generic/Tensor.c b/generic/Tensor.c
index aabbbdc..112a4bd 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -142,7 +142,7 @@ static int torch_Tensor_(new)(lua_State *L)
           THTensor_(free)(tensor);
           THError("invalid element (not a number)");
         }
-        THStorage_(set)(THTensor_(storage)(tensor), si++, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1)));
+        THStorage_(set)(THTensor_(storage)(tensor), si++, luaG_(checkreal)(L, -1));
         lua_pop(L, 1);
       }
 
@@ -1172,7 +1172,7 @@ static int torch_Tensor_(apply)(lua_State *L)
                   lua_call(L, 1, 1);
                   if(lua_isnumber(L, 3))
                   {
-                    *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 3));
+                    *tensor_data = luaG_(checkreal)(L, 3);
                     lua_pop(L, 1);
                   }
                   else if(lua_isnil(L, 3))
@@ -1198,7 +1198,7 @@ static int torch_Tensor_(map)(lua_State *L)
                   lua_call(L, 2, 1);
                   if(lua_isnumber(L, 4))
                   {
-                    *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 4));
+                    *tensor_data = luaG_(checkreal)(L, 4);
                     lua_pop(L, 1);
                   }
                   else if(lua_isnil(L, 4))
@@ -1226,7 +1226,7 @@ static int torch_Tensor_(map2)(lua_State *L)
                   lua_call(L, 3, 1);
                   if(lua_isnumber(L, 5))
                   {
-                    *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 5));
+                    *tensor_data = luaG_(checkreal)(L, 5);
                     lua_pop(L, 1);
                   }
                   else if(lua_isnil(L, 5))
diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c
index e131c57..37b2a08 100644
--- a/generic/TensorOperator.c
+++ b/generic/TensorOperator.c
@@ -2,6 +2,8 @@
 #define TH_GENERIC_FILE "generic/TensorOperator.c"
 #else
 
+#include "luaG.h"
+
 static int torch_TensorOperator_(__add__)(lua_State *L)
 {
   THTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
@@ -19,13 +21,13 @@ static int torch_TensorOperator_(__add__)(lua_State *L)
     {
       THTensor_(resizeAs)(r, tensor2);
       THTensor_(copy)(r, tensor2);
-      THTensor_(add)(r, r, luaL_checknumber(L, 1));
+      THTensor_(add)(r, r, luaG_(checkreal)(L, 1));
     }
     else if(tensor1 && !tensor2)
     {
       THTensor_(resizeAs)(r, tensor1);
       THTensor_(copy)(r, tensor1);
-      THTensor_(add)(r, r, luaL_checknumber(L, 2));
+      THTensor_(add)(r, r, luaG_(checkreal)(L, 2));
     }
     else
     {
@@ -53,14 +55,14 @@ static int torch_TensorOperator_(__sub__)(lua_State *L)
     if(!tensor1 && tensor2)
     {
       THTensor_(resizeAs)(r, tensor2);
-      THTensor_(fill)(r, luaL_checknumber(L, 1));
+      THTensor_(fill)(r, luaG_(checkreal)(L, 1));
       THTensor_(cadd)(r, r, -1, tensor2);
     }
     else if(tensor1 && !tensor2)
     {
       THTensor_(resizeAs)(r, tensor1);
       THTensor_(copy)(r, tensor1);
-      THTensor_(add)(r, r, -(real)luaL_checknumber(L, 2));
+      THTensor_(add)(r, r, -luaG_(checkreal)(L, 2));
     }
     else
     {
@@ -103,13 +105,13 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
     {
       THTensor_(resizeAs)(r, tensor2);
       THTensor_(copy)(r, tensor2);
-      THTensor_(mul)(r, r, luaL_checknumber(L, 1));
+      THTensor_(mul)(r, r, luaG_(checkreal)(L, 1));
     }
     else if(tensor1 && !tensor2)
     {
       THTensor_(resizeAs)(r, tensor1);
       THTensor_(copy)(r, tensor1);
-      THTensor_(mul)(r, r, luaL_checknumber(L, 2));
+      THTensor_(mul)(r, r, luaG_(checkreal)(L, 2));
     }
     else
     {
@@ -117,7 +119,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
       int dims = tensor2->nDimension;
 
       if(dimt == 1 && dims == 1)
-        lua_pushnumber(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */
+        luaG_(pushreal)(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */
       else if(dimt == 2 && dims == 1)
       {
         THTensor_(resize1d)(r, tensor1->size[0]);
diff --git a/init.lua b/init.lua
index 0f3cfbb..b1a0012 100644
--- a/init.lua
+++ b/init.lua
@@ -159,7 +159,6 @@ require('torch.FFInterface')
 require('torch.Tester')
 require('torch.TestSuite')
 require('torch.test')
-
 function torch.totable(obj)
    if torch.isTensor(obj) or torch.isStorage(obj) then
       return obj:totable()
@@ -189,4 +188,19 @@ torch.Tensor.isTensor = torch.isTensor
 -- remove this line to disable automatic heap-tracking for garbage collection
 torch.setheaptracking(true)
 
+function torch.multinomialAliasSetup(probs, state)
+   if torch.type(state) == 'table' then 
+      state[1], state[2] = torch.multinomialAliasSetup_(probs, state[1], state[2])
+   else
+      state = {}
+      state[1], state[2] = torch.multinomialAliasSetup_(probs)
+   end
+   return state
+end
+
+function torch.multinomialAlias(output, state)
+   torch.DoubleTensor.multinomialAlias_(output, state[1], state[2])
+   return output
+end
+
 return torch
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index c4e6694..c9be21a 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -20,6 +20,27 @@ IF(NOT TH_INSTALL_BIN_SUBDIR
   SET(TH_INSTALL_CMAKE_SUBDIR "share/cmake/TH" CACHE PATH "TH install cmake subdirectory")
 ENDIF()
 
+######################################################################
+###### macros section
+#####################################################################
+IF(NOT ADD_TORCH_LIBRARY)
+MACRO(ADD_TORCH_LIBRARY package type src)
+  IF ("${type}" STREQUAL "STATIC")
+    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
+      CUDA_ADD_LIBRARY(${package} STATIC ${src})
+    else()
+      ADD_LIBRARY(${package} STATIC ${src})
+    endif()
+  ELSE()
+    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
+      CUDA_ADD_LIBRARY(${package} ${type} ${src})
+    else()
+      ADD_LIBRARY(${package} ${type} ${src})
+    endif()
+  ENDIF()
+ENDMACRO()
+ENDIF()
+
 #######################################################################
 ##### flags section
 ######################################################################
@@ -145,7 +166,6 @@ IF(C_AVX2_FOUND)
   SET(CMAKE_C_FLAGS "-DUSE_AVX2 ${CMAKE_C_FLAGS}")
 ENDIF(C_AVX2_FOUND)
 
-
 CHECK_C_SOURCE_RUNS("
 #include <stdatomic.h>
 int main()
@@ -230,11 +250,11 @@ IF(C_AVX2_FOUND)
 ENDIF(C_AVX2_FOUND)
 
 SET(hdr
-  THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
+  THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
   THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
 
 SET(src
-  THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
+  THGeneral.c THHalf.c THAllocator.c THSize.c THStorage.c THTensor.c THBlas.c THLapack.c
   THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
 
 SET(src ${src} ${hdr} ${simd})
@@ -243,10 +263,15 @@ SET(src ${src} ${hdr} ${simd})
 ##### build section
 ######################################################################
 
-ADD_LIBRARY(TH SHARED ${src})
-if(BUILD_STATIC)
-  ADD_LIBRARY(TH_static STATIC ${src})
-endif()
+ADD_TORCH_LIBRARY(TH SHARED "${src}")
+
+IF (BUILD_STATIC OR "$ENV{STATIC_TH}" STREQUAL "YES")
+  ADD_TORCH_LIBRARY(TH_static STATIC "${src}")
+  SET_TARGET_PROPERTIES(TH_static PROPERTIES
+    COMPILE_FLAGS "-fPIC")
+  SET_TARGET_PROPERTIES(TH_static PROPERTIES
+    PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "TH")
+ENDIF()
 
 IF(NOT TH_SO_VERSION)
   SET(TH_SO_VERSION 0)
@@ -278,7 +303,16 @@ ENDIF()
 FIND_PACKAGE(BLAS)
 IF(BLAS_FOUND)
   SET(USE_BLAS 1)
-  TARGET_LINK_LIBRARIES(TH ${BLAS_LIBRARIES})
+  IF ($ENV{TH_BINARY_BUILD})
+    MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
+    TARGET_LINK_LIBRARIES(TH "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
+  ELSE ($ENV{TH_BINARY_BUILD})
+    TARGET_LINK_LIBRARIES(TH ${BLAS_LIBRARIES})
+  ENDIF ($ENV{TH_BINARY_BUILD})
+  
+  IF(BLAS_INFO STREQUAL "mkl")
+    ADD_DEFINITIONS(-DTH_BLAS_MKL)
+  ENDIF()
 ENDIF(BLAS_FOUND)
 
 FIND_PACKAGE(LAPACK)
@@ -373,6 +407,7 @@ INSTALL(FILES
   THLogAdd.h
   THMemoryFile.h
   THRandom.h
+  THSize.h
   THStorage.h
   THTensor.h
   THTensorApply.h
diff --git a/lib/TH/TH.h b/lib/TH/TH.h
index cdf331d..11f208c 100644
--- a/lib/TH/TH.h
+++ b/lib/TH/TH.h
@@ -12,6 +12,7 @@
 #include "THVector.h"
 #include "THLogAdd.h"
 #include "THRandom.h"
+#include "THSize.h"
 #include "THStorage.h"
 #include "THTensor.h"
 #include "THTensorApply.h"
diff --git a/lib/TH/THAllocator.c b/lib/TH/THAllocator.c
index 51ac69b..e69b3cc 100644
--- a/lib/TH/THAllocator.c
+++ b/lib/TH/THAllocator.c
@@ -105,6 +105,10 @@ void THMapAllocatorContext_free(THMapAllocatorContext *ctx)
 
 static void *_map_alloc(void* ctx_, ptrdiff_t size)
 {
+  if (size == 0) {
+    return NULL;
+  }
+
   THMapAllocatorContext *ctx = ctx_;
   void *data = NULL;
 
@@ -332,6 +336,9 @@ static void *THMapAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) {
 }
 
 static void THMapAllocator_free(void* ctx_, void* data) {
+  if (data == NULL)
+    return;
+
   THMapAllocatorContext *ctx = ctx_;
 
 #ifdef _WIN32
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index 01b1951..3f57b3b 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -3,6 +3,9 @@
 #include "THFilePrivate.h"
 
 #include <stdint.h>
+#ifndef LLONG_MAX
+#define LLONG_MAX 9223372036854775807LL
+#endif
 
 typedef struct THDiskFile__
 {
diff --git a/lib/TH/THGeneral.c b/lib/TH/THGeneral.c
index bb9bfc3..ac032b9 100644
--- a/lib/TH/THGeneral.c
+++ b/lib/TH/THGeneral.c
@@ -343,3 +343,41 @@ int THGetNumCores(void)
   return 1;
 #endif
 }
+
+#ifdef TH_BLAS_MKL
+extern int mkl_get_max_threads(void);
+#endif
+
+TH_API void THInferNumThreads(void)
+{
+#if defined(_OPENMP) && defined(TH_BLAS_MKL)
+  // If we are using MKL and OpenMP, make sure the numbers of threads match.
+  // Otherwise, MKL and our OpenMP-enabled functions will keep changing the
+  // size of the OpenMP thread pool, resulting in worse performance (and memory
+  // leaks in GCC 5.4)
+  omp_set_num_threads(mkl_get_max_threads());
+#endif
+}
+
+TH_API THDescBuff _THSizeDesc(const long *size, const long ndim) {
+  const int L = TH_DESC_BUFF_LEN;
+  THDescBuff buf;
+  char *str = buf.str;
+  int n = 0;
+  n += snprintf(str, L-n, "[");
+  int i;
+  for(i = 0; i < ndim; i++) {
+    if(n >= L) break;
+    n += snprintf(str+n, L-n, "%ld", size[i]);
+    if(i < ndim-1) {
+      n += snprintf(str+n, L-n, " x ");
+    }
+  }
+  if(n < L - 2) {
+    snprintf(str+n, L-n, "]");
+  } else {
+    snprintf(str+L-5, 5, "...]");
+  }
+  return buf;
+}
+
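In Lua terms, the `_THSizeDesc` helper added above behaves roughly like the following sketch (ignoring the fixed-size buffer and its `...]` truncation); `sizeDesc` is an illustrative name, not a real API:

```lua
-- Render a shape like {3, 4, 5} as "[3 x 4 x 5]", as _THSizeDesc does.
local function sizeDesc(size)
   return "[" .. table.concat(size, " x ") .. "]"
end

print(sizeDesc({3, 4, 5}))  -- [3 x 4 x 5]
```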
diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
index de11f1b..0e1cb5d 100644
--- a/lib/TH/THGeneral.h.in
+++ b/lib/TH/THGeneral.h.in
@@ -14,6 +14,7 @@
 #cmakedefine USE_BLAS
 #cmakedefine USE_LAPACK
 #cmakedefine BLAS_F2C
+#cmakedefine BLAS_USE_CBLAS_DOT
 
 #ifdef __cplusplus
 # define TH_EXTERNC extern "C"
@@ -42,8 +43,14 @@
 typedef void (*THErrorHandlerFunction)(const char *msg, void *data);
 typedef void (*THArgErrorHandlerFunction)(int argNumber, const char *msg, void *data);
 
+#define TH_DESC_BUFF_LEN 64
+typedef struct {
+    char str[TH_DESC_BUFF_LEN];
+} THDescBuff;
+
 
 TH_API double THLog1p(const double x);
+TH_API THDescBuff _THSizeDesc(const long *size, const long ndim);
 TH_API void _THError(const char *file, const int line, const char *fmt, ...);
 TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...);
 TH_API void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data);
@@ -60,6 +67,7 @@ TH_API void THHeapUpdate(ptrdiff_t size);
 TH_API void THSetNumThreads(int num_threads);
 TH_API int THGetNumThreads(void);
 TH_API int THGetNumCores(void);
+TH_API void THInferNumThreads(void);
 
 #define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__)
 
diff --git a/lib/TH/THSize.c b/lib/TH/THSize.c
new file mode 100644
index 0000000..ccf1f61
--- /dev/null
+++ b/lib/TH/THSize.c
@@ -0,0 +1,26 @@
+#include "THSize.h"
+
+int THSize_isSameSizeAs(const long *sizeA, long dimsA, const long *sizeB, long dimsB) {
+  int d;
+  if (dimsA != dimsB)
+    return 0;
+  for(d = 0; d < dimsA; ++d)
+  {
+    if(sizeA[d] != sizeB[d])
+      return 0;
+  }
+  return 1;
+}
+
+ptrdiff_t THSize_nElement(long dims, long *size) {
+  if(dims == 0)
+    return 0;
+  else
+  {
+    ptrdiff_t nElement = 1;
+    int d;
+    for(d = 0; d < dims; d++)
+      nElement *= size[d];
+    return nElement;
+  }
+}
diff --git a/lib/TH/THSize.h b/lib/TH/THSize.h
new file mode 100644
index 0000000..3d39696
--- /dev/null
+++ b/lib/TH/THSize.h
@@ -0,0 +1,13 @@
+#ifndef TH_SIZE_INC
+#define TH_SIZE_INC
+
+#include "THGeneral.h"
+#include <stddef.h>
+
+// THTensor functions that would work on a THSize if we had such a class in C++,
+// i.e. THTensor functions that depend only on the shape of the tensor, not the type.
+
+TH_API int THSize_isSameSizeAs(const long *sizeA, long dimsA, const long *sizeB, long dimsB);
+TH_API ptrdiff_t THSize_nElement(long dims, long *size);
+
+#endif
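The semantics of these two helpers amount, in Lua terms, to the sketch below (illustrative only; the real functions operate on raw `long` arrays in C):

```lua
-- Lua equivalents of THSize_isSameSizeAs and THSize_nElement.
local function isSameSizeAs(sizeA, sizeB)
   if #sizeA ~= #sizeB then return false end
   for d = 1, #sizeA do
      if sizeA[d] ~= sizeB[d] then return false end
   end
   return true
end

local function nElement(size)
   if #size == 0 then return 0 end   -- a dimensionless tensor has no elements
   local n = 1
   for d = 1, #size do n = n * size[d] end
   return n
end
```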
diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c
index cea0f95..f6b63f4 100644
--- a/lib/TH/THStorage.c
+++ b/lib/TH/THStorage.c
@@ -15,28 +15,10 @@
 
 
 THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) {
-  const int L = TH_DESC_BUFF_LEN;
-  THDescBuff buf;
-  char *str = buf.str;
-  int n = 0;
-  n += snprintf(str, L-n, "[");
-  int i;
-  for(i = 0; i < size->size; i++) {
-    if(n >= L) break;
-    n += snprintf(str+n, L-n, "%ld", size->data[i]);
-    if(i < size->size-1) {
-      n += snprintf(str+n, L-n, " x ");
-    }
-  }
-  if(n < L - 2) {
-    snprintf(str+n, L-n, "]");
-  } else {
-    snprintf(str+L-5, 5, "...]");
-  }
-  return buf;
+  return _THSizeDesc(size->data, size->size);
 }
 
-TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement)
+THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement)
 {
   ptrdiff_t total_size = (size->size > 0 ? 1 : 0);
   ptrdiff_t dim_infer = -1;
@@ -66,39 +48,106 @@ TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t
   return copy;
 }
 
-TH_API void THLongStorage_calculateExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, THLongStorage *sizes, long **esz, long **est) {
-  ptrdiff_t ndim = THLongStorage_size(sizes);
-  long numUnsqueezed = ndim - tensorDim;
+int THLongStorage_inferSize2(THLongStorage *output, long *sizesA, long dimsA, long *sizesB, long dimsB,
+                             char *error_buffer, int buffer_len) {
+  THArgCheck(sizesA != NULL, 1, "sizesA must not be null");
+  THArgCheck(sizesB != NULL, 2, "sizesB must not be null");
+  THArgCheck(dimsA, 1, "Can't expand empty tensor a");
+  THArgCheck(dimsB, 1, "Can't expand empty tensor b");
+  ptrdiff_t ndim = dimsA > dimsB ? dimsA : dimsB;
 
   long *expandedSizes = THAlloc(sizeof(long)*ndim);
-  long *expandedStrides = THAlloc(sizeof(long)*ndim);
 
-  for (long i = numUnsqueezed; i < ndim; ++i) {
-    expandedSizes[i] = tensorSizes[i - numUnsqueezed];
-    expandedStrides[i] = tensorStrides[i - numUnsqueezed];
+  for (long i = ndim - 1; i >= 0; --i) {
+    long offset = ndim - 1 - i;
+    long dimA = dimsA - 1 - offset;
+    long dimB = dimsB - 1 - offset;
+    long sizeA = (dimA >= 0) ? sizesA[dimA] : 1;
+    long sizeB = (dimB >= 0) ? sizesB[dimB] : 1;
+    if (sizeA == sizeB || sizeA == 1 || sizeB == 1) {
+      expandedSizes[i] = THMax(sizeA, sizeB);
+    } else {
+      THFree(expandedSizes);
+      snprintf(error_buffer, buffer_len, "The size of tensor a (%ld) must match the size of tensor b (%ld) at "
+               "non-singleton dimension %ld.", sizeA, sizeB, i);
+      return -1;
+    }
   }
+  THLongStorage_resize(output, ndim);
+  memcpy(THLongStorage_data(output), expandedSizes, sizeof(long)*ndim);
+  THFree(expandedSizes);
+  return 0;
+}
 
-  for (long i = numUnsqueezed - 1; i > -1; --i) {
-    expandedSizes[i] = 1;
-    expandedStrides[i] = expandedSizes[i+1] * expandedStrides[i+1];
+int THLongStorage_inferSizeN(THLongStorage *output, int n, long **sizes, long *dims,
+                             char *error_buffer, int buffer_len) {
+  THArgCheck(n > 0, 2, "n must be greater than 0");
+  THArgCheck(sizes != NULL, 1, "sizes must not be null");
+  THArgCheck(dims != NULL, 1, "dims must not be null");
+
+  ptrdiff_t ndim = 0;
+  for (int j = 0; j < n; ++j) {
+    THArgCheck(sizes[ j ] != NULL, 1, "size %d must not be null", j);
+    THArgCheck(dims[ j ], 1, "Can't expand empty tensor %d", j);
+    ndim = dims[ j ] > ndim ? dims[ j ] : ndim;
   }
 
-  // create a new geometry for the tensor
-  for (long i = 0; i < ndim; ++i) {
-    long size = expandedSizes[i];
+  long *expandedSizes = THAlloc(sizeof(long)*ndim);
+
+  for (long i = ndim - 1; i >= 0; --i) {
+    expandedSizes[ i ] = 1;
+    long offset = ndim - 1 - i;
+    for (int j  = 0; j < n; ++j) {
+      long dim = dims[ j ] - 1 - offset;
+      long size = (dim >= 0) ? sizes[ j ][ dim ] : 1;
+      if (size == expandedSizes[ i ] || size == 1 || expandedSizes[ i ] == 1) {
+        expandedSizes[ i ] =  THMax(expandedSizes[ i ], size);
+      } else {
+        THFree(expandedSizes);
+        snprintf(error_buffer, buffer_len, "The size of tensor %i (%ld) must match the expanded size "
+                 "of tensor (%ld) at non-singleton dimension %ld.", j, size, expandedSizes[ i ], i);
+        return -1;
+      }
+    }
+  }
+  THLongStorage_resize(output, ndim);
+  memcpy(THLongStorage_data(output), expandedSizes, sizeof(long)*ndim);
+  THFree(expandedSizes);
+  return 0;
+}
+
+int THLongStorage_inferExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim,
+                                        THLongStorage *sizes, long **expandedSizes, long **expandedStrides,
+                                        char *error_buffer, int buffer_len) {
+  ptrdiff_t ndim = THLongStorage_size(sizes);
+
+  long *expandedSizesCalc = THAlloc(sizeof(long)*ndim);
+  long *expandedStridesCalc = THAlloc(sizeof(long)*ndim);
+
+  // create a new geometry for the tensors
+  for (long i = ndim - 1; i >= 0; --i) {
+    long offset = ndim - 1 - i;
+    long dim = tensorDim - 1 - offset;
+    long size = (dim >= 0) ? tensorSizes[dim] : 1;
+    long stride = (dim >= 0) ?
+        tensorStrides[dim] : expandedSizesCalc[i + 1] * expandedStridesCalc[i+1];
     long targetSize = THLongStorage_data(sizes)[i];
-    if (size == 1) {
-      if (targetSize != 1) {
-        expandedSizes[i] = targetSize;
-        expandedStrides[i] = 0;
+    if (size != targetSize) {
+      if (size == 1) {
+        size = targetSize;
+        stride = 0;
+      } else {
+        THFree(expandedSizesCalc);
+        THFree(expandedStridesCalc);
+        snprintf(error_buffer, buffer_len, "The expanded size of the tensor (%ld) must match the existing size (%ld) at "
+                 "non-singleton dimension %ld.", targetSize, size, i);
+        return -1;
       }
-    } else if (size != targetSize) {
-      THFree(expandedSizes);
-      THFree(expandedStrides);
-      THError("The expanded size of the tensor (%d) must match the existing size (%d) at \
-              non-singleton dimension %ld.", targetSize, size, i);
     }
+    expandedSizesCalc[i] = size;
+    expandedStridesCalc[i] = stride;
   }
-  *esz = expandedSizes;
-  *est = expandedStrides;
+  *expandedSizes = expandedSizesCalc;
+  *expandedStrides = expandedStridesCalc;
+  return 0;
 }
diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h
index f81926c..fb7946b 100644
--- a/lib/TH/THStorage.h
+++ b/lib/TH/THStorage.h
@@ -7,11 +7,6 @@
 #define THStorage        TH_CONCAT_3(TH,Real,Storage)
 #define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME)
 
-#define TH_DESC_BUFF_LEN 64
-typedef struct {
-    char str[TH_DESC_BUFF_LEN];
-} THDescBuff;
-
 /* fast access methods */
 #define TH_STORAGE_GET(storage, idx) ((storage)->data[(idx)])
 #define TH_STORAGE_SET(storage, idx, value) ((storage)->data[(idx)] = (value))
@@ -30,6 +25,15 @@ typedef struct {
 
 TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size);
 TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement);
-TH_API void THLongStorage_calculateExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, THLongStorage *sizes, long **esz, long **est);
+
+// Given the sizes of {2,N} tensors, write out the size when the tensors are expanded together.
+TH_API int THLongStorage_inferSize2(THLongStorage *output, long *sizesA, long dimsA,
+                                    long *sizesB, long dimsB, char *error_buffer, int buffer_len);
+TH_API int THLongStorage_inferSizeN(THLongStorage *output, int n, long **sizes, long *dims,
+                                    char *error_buffer, int buffer_len);
+
+TH_API int THLongStorage_inferExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim,
+                                             THLongStorage *sizes, long **expandedSizes, long **expandedStrides,
+                                             char *error_buffer, int buffer_len);
 
 #endif
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h
index 86672b9..7f48da4 100644
--- a/lib/TH/THTensorApply.h
+++ b/lib/TH/THTensorApply.h
@@ -141,10 +141,24 @@
   __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
   __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
   __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \
-\
-  if(TENSOR1##_n != TENSOR2##_n || TENSOR1##_n != TENSOR3##_n) /* should we do the check in the function instead? i think so */ \
-    THError("inconsistent tensor size"); \
-\
+                                                                        \
+  int elements_equal = 1;                                               \
+  if(TENSOR1##_n != TENSOR2##_n) {                                      \
+    elements_equal = 0;                                                 \
+  }                                                                     \
+  else if(TENSOR1##_n != TENSOR3##_n) {                                 \
+    elements_equal = 0;                                                 \
+  }                                                                     \
+  if (elements_equal == 0) {                                            \
+    THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \
+    THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \
+    THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->nDimension); \
+    THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \
+            "number of elements, but got %d, %d and %d elements respectively", \
+            #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, \
+            TENSOR1##_n, TENSOR2##_n, TENSOR3##_n);                     \
+  }                                                                     \
+                                                                        \
   while(!TH_TENSOR_APPLY_hasFinished) \
   { \
     /* Loop through the inner most region of the Tensor */ \
@@ -174,9 +188,13 @@
   __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
   __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
 \
-  if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
-    THError("inconsistent tensor size"); \
-\
+    if(TENSOR1##_n != TENSOR2##_n) {                                    \
+      THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \
+      THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \
+      THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \
+              "number of elements, but got %d and %d elements respectively", \
+              #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, TENSOR1##_n, TENSOR2##_n); \
+    }                                                                   \
   while(!TH_TENSOR_APPLY_hasFinished) \
   { \
     /* Loop through the inner most region of the Tensor */ \
diff --git a/lib/TH/THTensorDimApply.h b/lib/TH/THTensorDimApply.h
index df333fa..6727e1f 100644
--- a/lib/TH/THTensorDimApply.h
+++ b/lib/TH/THTensorDimApply.h
@@ -14,19 +14,38 @@
   int TH_TENSOR_DIM_APPLY_i; \
 \
   if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->nDimension) ) \
-    THError("invalid dimension"); \
-  if( TENSOR1->nDimension != TENSOR2->nDimension ) \
-    THError("inconsistent tensor sizes"); \
-  if( TENSOR1->nDimension != TENSOR3->nDimension ) \
-    THError("inconsistent tensor sizes"); \
+    THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->nDimension); \
+  int same_dims = 1;                                                    \
+  if( TENSOR1->nDimension != TENSOR2->nDimension ) {                    \
+    same_dims = 0;                                                      \
+  } \
+  if( TENSOR1->nDimension != TENSOR3->nDimension ) { \
+    same_dims = 0;                                   \
+  } \
+  if (same_dims == 0) { \
+    THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \
+    THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \
+    THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->nDimension); \
+    THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \
+            "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str); \
+  }                                                                     \
+  int shape_check_flag = 0;                                             \
   for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
   { \
     if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
       continue; \
     if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) \
-      THError("inconsistent tensor sizes"); \
+      shape_check_flag = 1;                                             \
     if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR3->size[TH_TENSOR_DIM_APPLY_i]) \
-      THError("inconsistent tensor sizes"); \
+      shape_check_flag = 1;                                             \
+  } \
+    \
+  if (shape_check_flag == 1) { \
+    THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \
+    THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \
+    THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->nDimension); \
+    THError("Expected %s %s, %s %s and %s %s to have the same size in dimension %d", \
+            #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, DIMENSION); \
   } \
 \
   TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); \
@@ -119,15 +138,24 @@
   int TH_TENSOR_DIM_APPLY_i; \
 \
   if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->nDimension) ) \
-    THError("invalid dimension"); \
-  if( TENSOR1->nDimension != TENSOR2->nDimension ) \
-    THError("inconsistent tensor sizes"); \
+    THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->nDimension); \
+  if( TENSOR1->nDimension != TENSOR2->nDimension ) {                    \
+    THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \
+    THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \
+    THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \
+            "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str);        \
+  }                                                                     \
+  int shape_check_flag = 0;                                             \
   for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
   { \
     if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
       continue; \
-    if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) \
-      THError("inconsistent tensor sizes"); \
+    if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \
+      THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \
+      THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \
+      THError("Expected %s %s and %s %s to have the same size in dimension %d", \
+              #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, DIMENSION);   \
+    }                                                                   \
   } \
 \
   TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); \
diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 2188fc7..a62cfad 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -274,6 +274,22 @@ int main() {
   ELSE (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
     SET(BLAS_F2C FALSE)
   ENDIF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
+  CHECK_C_SOURCE_RUNS("
+#include <stdlib.h>
+#include <stdio.h>
+float x[4] = { 1, 2, 3, 4 };
+float y[4] = { .1, .01, .001, .0001 };
+extern float cblas_sdot();
+int main() {
+  int i;
+  double r = cblas_sdot(4, x, 1, y, 1);
+  exit((float)r != (float).1234);
+}" BLAS_USE_CBLAS_DOT )
+  IF (BLAS_USE_CBLAS_DOT)
+    SET(BLAS_USE_CBLAS_DOT TRUE)
+  ELSE (BLAS_USE_CBLAS_DOT)
+    SET(BLAS_USE_CBLAS_DOT FALSE)
+  ENDIF (BLAS_USE_CBLAS_DOT)
 ENDIF(BLAS_LIBRARIES)
 
 # epilogue
diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 7c9325a..88f0aa3 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -50,7 +50,7 @@ ENDIF ("${SIZE_OF_VOIDP}" EQUAL 8)
 IF(CMAKE_COMPILER_IS_GNUCC)
   SET(mklthreads "mkl_gnu_thread" "mkl_intel_thread")
   SET(mklifaces  "gf" "intel")
-  SET(mklrtls "iomp5")
+  SET(mklrtls "gomp" "iomp5")
 ELSE(CMAKE_COMPILER_IS_GNUCC)
   SET(mklthreads "mkl_intel_thread")
   SET(mklifaces  "intel")
@@ -92,7 +92,7 @@ ENDIF (INTEL_MKL_DIR)
 # Try linking multiple libs
 MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags)
   # This macro checks for the existence of the combination of libraries given by _list.
-  # If the combination is found, this macro whether we can link against that library
+  # If the combination is found, this macro checks whether we can link against that library
   # combination using the name of a routine given by _name using the linker
   # flags given by _flags.  If the combination of libraries is found and passes
   # the link test, LIBRARIES is set to the list of complete library paths that
@@ -116,8 +116,15 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags)
   message(STATUS "Checking for [${__list}]")
   FOREACH(_library ${_list})
     SET(_combined_name ${_combined_name}_${_library})
-    IF(_libraries_work)      
-      FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library})
+    IF(_libraries_work)
+      IF(${_library} STREQUAL "gomp")
+          FIND_PACKAGE(OpenMP)
+          IF(OPENMP_FOUND)
+	      SET(${_prefix}_${_library}_LIBRARY ${OpenMP_C_FLAGS})
+          ENDIF(OPENMP_FOUND)
+      ELSE(${_library} STREQUAL "gomp")
+          FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library})
+      ENDIF(${_library} STREQUAL "gomp")
       MARK_AS_ADVANCED(${_prefix}_${_library}_LIBRARY)
       SET(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
       SET(_libraries_work ${${_prefix}_${_library}_LIBRARY})
@@ -131,6 +138,7 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags)
   # Test this combination of libraries.
   IF(_libraries_work)
     SET(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}})
+    SET(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES};${CMAKE_REQUIRED_LIBRARIES}")
     CHECK_FUNCTION_EXISTS(${_name} ${_prefix}${_combined_name}_WORKS)
     SET(CMAKE_REQUIRED_LIBRARIES)
     MARK_AS_ADVANCED(${_prefix}${_combined_name}_WORKS)
@@ -150,6 +158,11 @@ else(WIN32)
   set(mkl_m "m")
 endif(WIN32)
 
+if(UNIX AND NOT APPLE)
+  set(mkl_dl "${CMAKE_DL_LIBS}")
+else(UNIX AND NOT APPLE)
+  set(mkl_dl "")
+endif(UNIX AND NOT APPLE)
 
 # Check for version 10/11
 IF (NOT MKL_LIBRARIES)
@@ -161,7 +174,7 @@ FOREACH(mklrtl ${mklrtls} "")
       FOREACH(mklthread ${mklthreads})
         IF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL)
           CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
-            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "")
+            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m};${mkl_dl}" "")
         ENDIF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL)          
       ENDFOREACH(mklthread)
     ENDFOREACH(mkl64)
@@ -172,7 +185,7 @@ FOREACH(mklrtl ${mklrtls} "")
     FOREACH(mkl64 ${mkl64s} "")
       IF (NOT MKL_LIBRARIES)
         CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
-          "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;${mkl_m}" "")
+          "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;${mkl_m};${mkl_dl}" "")
         IF (MKL_LIBRARIES)
           SET(mklseq "_sequential")
         ENDIF (MKL_LIBRARIES)
@@ -186,7 +199,7 @@ FOREACH(mklrtl ${mklrtls} "")
       FOREACH(mklthread ${mklthreads})
         IF (NOT MKL_LIBRARIES)
           CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
-            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "")
+            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m};${mkl_dl}" "")
         ENDIF (NOT MKL_LIBRARIES)          
       ENDFOREACH(mklthread)
     ENDFOREACH(mkl64)
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index b04931f..bcd2a65 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -18,7 +18,18 @@ TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy);
 TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy);
 TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy);
 TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy);
+#ifdef BLAS_USE_CBLAS_DOT
+TH_EXTERNC float cblas_sdot(const int n, const float *x, const int incx, const float *y, const int incy);
+#ifndef THBlas_C_sdot_
+#define THBlas_C_sdot_
+static inline ffloat sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy)
+{
+  return cblas_sdot(*n, x, *incx, y, *incy);
+}
+#endif
+#else
 TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy);
+#endif
 TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
 TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
 TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda);
diff --git a/lib/TH/generic/THTensor.c b/lib/TH/generic/THTensor.c
index eab7231..8ebec67 100644
--- a/lib/TH/generic/THTensor.c
+++ b/lib/TH/generic/THTensor.c
@@ -286,20 +286,73 @@ void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, lon
 }
 
 THTensor* THTensor_(newExpand)(THTensor *tensor, THLongStorage *sizes) {
-  THArgCheck(THLongStorage_size(sizes) >= THTensor_(nDimension)(tensor), 1, "the number of sizes provided \
-      must be greater or equal to the number of dimensions in the tensor");
+  THTensor *result = THTensor_(new)();
+  THTensor_(expand)(result, tensor, sizes);
+  return result;
+}
+
+void THTensor_(expand)(THTensor *r, THTensor *tensor, THLongStorage *sizes) {
   THArgCheck(THTensor_(nDimension)(tensor) > 0, 0, "can't expand an empty tensor");
+  THArgCheck(THLongStorage_size(sizes) >= THTensor_(nDimension)(tensor), 1,
+             "the number of sizes provided must be greater or equal to the "
+             "number of dimensions in the tensor");
 
   long *expandedSizes;
   long *expandedStrides;
-  THLongStorage_calculateExpandGeometry(tensor->size, tensor->stride, THTensor_(nDimension)(tensor), sizes, &expandedSizes, &expandedStrides);
+  char error_buffer[1024];
+  int ret =
+      THLongStorage_inferExpandGeometry(tensor->size, tensor->stride, THTensor_(nDimension)(tensor),
+                                        sizes, &expandedSizes, &expandedStrides, error_buffer, 1024);
 
-  THTensor *result = THTensor_(new)();
-  THTensor_(setStorageNd)(result, THTensor_(storage)(tensor), THTensor_(storageOffset)(tensor), THLongStorage_size(sizes), expandedSizes, expandedStrides);
+  if (ret != 0) {
+    THError("%s", error_buffer);
+    return;
+  }
+
+  THTensor_(setStorageNd)(r, THTensor_(storage)(tensor), THTensor_(storageOffset)(tensor),
+                          THLongStorage_size(sizes), expandedSizes, expandedStrides);
   THFree(expandedSizes);
   THFree(expandedStrides);
+}
 
-  return result;
+
+void THTensor_(expandNd)(THTensor **rets, THTensor **ops, int count) {
+  for (int i = 0; i < count; ++i) {
+    THArgCheck(THTensor_(nDimension)(ops[i]) > 0, i, "can't expand empty tensor %d", i);
+  }
+
+  long **op_sizes = THAlloc(sizeof(long*) * count);
+  long *op_dims = THAlloc(sizeof(long) * count);
+
+  for (int i = 0; i < count; ++i) {
+    op_sizes[i] = ops[i]->size;
+    op_dims[i] = ops[i]->nDimension;
+  }
+
+  THLongStorage *sizes = THLongStorage_new();
+  char error_buffer[1024];
+  int ret = THLongStorage_inferSizeN(sizes,
+                                     count,
+                                     op_sizes,
+                                     op_dims,
+                                     error_buffer,
+                                     1024);
+
+  if(ret != 0) {
+    THFree(op_sizes);
+    THFree(op_dims);
+    THLongStorage_free(sizes);
+    THError("%s", error_buffer);
+    return;
+  }
+
+  for (int i = 0; i < count; ++i) {
+    THTensor_(expand)(rets[i], ops[i], sizes);
+  }
+
+  THFree(op_sizes);
+  THFree(op_dims);
+  THLongStorage_free(sizes);
 }
 
 void THTensor_(set)(THTensor *self, THTensor *src)
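expand creates a view, never a copy: broadcast dimensions get stride 0, so every expanded element aliases the original storage. A minimal Lua sketch of the user-level behaviour:

    local x = torch.Tensor({{1}, {2}, {3}})  -- 3x1
    local y = x:expand(3, 4)                 -- 3x4 view over the same storage
    print(y:stride(2))                       -- 0: all four columns alias column 1
    x[1][1] = 7
    print(y[1][4])                           -- 7, since no data was copied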
diff --git a/lib/TH/generic/THTensor.h b/lib/TH/generic/THTensor.h
index 8754796..9fb246c 100644
--- a/lib/TH/generic/THTensor.h
+++ b/lib/TH/generic/THTensor.h
@@ -71,6 +71,9 @@ TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long siz
 TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size);
 TH_API THTensor *THTensor_(newExpand)(THTensor *tensor, THLongStorage *size);
 
+TH_API void THTensor_(expand)(THTensor *r, THTensor *tensor, THLongStorage *size);
+TH_API void THTensor_(expandNd)(THTensor **rets, THTensor **ops, int count);
+
 TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride);
 TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
 TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, long *size, long *stride);
diff --git a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c
index e909728..d9cd1c0 100644
--- a/lib/TH/generic/THTensorCopy.c
+++ b/lib/TH/generic/THTensorCopy.c
@@ -2,17 +2,86 @@
 #define TH_GENERIC_FILE "generic/THTensorCopy.c"
 #else
 
+int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) {
+  const int MIN_SZ = 60 * 60;
+  return THTensor_(isContiguous)(tensor) &&
+         THTensor_(nDimension)(src) == 2 &&
+         THTensor_(stride)(src, 0) == 1 &&
+         THTensor_(stride)(src, 1) == THTensor_(size)(src, 0) &&
+         THTensor_(nElement)(tensor) >= MIN_SZ;
+}
+
+// special case copy where tensor is contiguous and src is a transposed matrix
+// This can be generalized to most copies, but it's trickier
+void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) {
+  #define MIN(x, y) (((x) < (y)) ? (x) : (y))
+  #define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#ifdef TH_REAL_IS_BYTE
+  const int BLOCK_SZ = 120;
+#else
+  const int BLOCK_SZ = 60;
+#endif
+
+  THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ);
+  real *sp = THTensor_(data)(src);
+  real *rp = THTensor_(data)(tensor);
+  real *bp = THTensor_(data)(buf);
+
+  long NR = THTensor_(size)(src, 0);
+  long NC = THTensor_(size)(src, 1);
+  for (long R = 0; R < NR; R += BLOCK_SZ) {
+    for (long C = 0; C < NC; C += BLOCK_SZ) {
+      real *spo = sp + R + C * NR;
+      real *rpo = rp + C + R * NC;
+
+      int nr = MIN(NR - R, BLOCK_SZ);
+      int nc = MIN(NC - C, BLOCK_SZ);
+
+      // 1. copy columns from src to buf
+      for (int c = 0; c < nc; c++) {
+        memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(real));
+      }
+
+      // 2. transpose buf in place
+      int rc_max = MAX(nr, nc);
+      int rc_min = MIN(nr, nc);
+      for (int r = 0; r < rc_max; r++) {
+        int end = MIN(r, rc_min);
+        for (int c = 0; c < end; c++) {
+          real tmp = bp[r + BLOCK_SZ * c];
+          bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
+          bp[r * BLOCK_SZ + c] = tmp;
+        }
+      }
+
+      // 3. copy rows from buf to dst
+      for (int r = 0; r < nr; r++) {
+        memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(real));
+      }
+    }
+  }
+  THTensor_(free)(buf);
+  #undef MIN
+  #undef MAX
+}
+
 void THTensor_(copy)(THTensor *tensor, THTensor *src)
 {
+  if (tensor == src) return;
   if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) {
     real *sp = THTensor_(data)(src);
     real *rp = THTensor_(data)(tensor);
     ptrdiff_t sz = THTensor_(nElement)(tensor);
 #ifndef TH_REAL_IS_HALF
-    THVector_(copy)(rp, sp, sz); 
+    THVector_(copy)(rp, sp, sz);
 #else
     memcpy(rp, sp, sz * sizeof(real));
 #endif
+#ifndef TH_REAL_IS_HALF
+  } else if (THTensor_(copyTransposeValid)(tensor, src)) {
+    THTensor_(copyTranspose)(tensor, src);
+#endif
   } else {
     TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
   }
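copyTranspose is a cache-blocked fast path: it copies BLOCK_SZ x BLOCK_SZ tiles into a small buffer, transposes each tile in place, and writes rows back out, so both the read and the write streams stay sequential. It only changes speed, never results; a quick Lua check of the eligible case:

    local src = torch.randn(512, 512):t()  -- 2D, strides (1, 512): transposed layout
    local dst = torch.Tensor(512, 512)     -- contiguous destination, >= 60*60 elements
    dst:copy(src)                          -- takes the blocked fast path
    assert((dst - src):abs():max() == 0)   -- identical to the generic element-wise copy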
diff --git a/lib/TH/generic/THTensorLapack.c b/lib/TH/generic/THTensorLapack.c
index d0196c9..d4e52f6 100644
--- a/lib/TH/generic/THTensorLapack.c
+++ b/lib/TH/generic/THTensorLapack.c
@@ -938,9 +938,12 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co
   THTensor_(free)(work);
 }
 
-void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, THTensor *a)
+void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a)
 {
   THArgCheck(THTensor_(nDimension)(a) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(a));
+  if (!pivot) {
+    THError("btrifact without pivoting is not implemented on the CPU");
+  }
 
   if (ra_ != a) {
     THTensor_(resizeAs)(ra_, a);
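The new pivot flag is accepted for API parity with the GPU backend, but the CPU path still requires partial pivoting. A hedged Lua sketch of the supported call (assuming the wrapper keeps pivoting enabled by default):

    local A = torch.randn(4, 3, 3)          -- a batch of four 3x3 matrices
    local LU, pivots = torch.btrifact(A)    -- batched LU factorization with pivoting
    print(LU:size(), pivots:size())         -- expected 4x3x3 and 4x3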
diff --git a/lib/TH/generic/THTensorLapack.h b/lib/TH/generic/THTensorLapack.h
index 29db83b..8785943 100644
--- a/lib/TH/generic/THTensorLapack.h
+++ b/lib/TH/generic/THTensorLapack.h
@@ -19,7 +19,7 @@ TH_API void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau);
 TH_API void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans);
 TH_API void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor*a, const char* uplo, real tol);
 
-TH_API void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, THTensor *a);
+TH_API void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a);
 TH_API void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots);
 
 #endif
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index 1dc1bc7..ac099cf 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -2,6 +2,10 @@
 #define TH_GENERIC_FILE "generic/THTensorMath.c"
 #else
 
+#ifndef NAN
+  #define NAN (nan(""))
+#endif
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -466,6 +470,31 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor
                        })
 }
 
+void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src)
+{
+  long elems_per_row, i, idx;
+
+  THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds");
+  THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3,
+             "Index tensor must have same dimensions as output tensor");
+  THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4,
+             "Input tensor must have same dimensions as output tensor");
+
+  elems_per_row = THLongTensor_size(index, dim);
+
+  TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim,
+                       for (i = 0; i < elems_per_row; ++i)
+                       {
+                         idx = *(index_data + i*index_stride);
+                         if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE)
+                         {
+                           THFree(TH_TENSOR_DIM_APPLY_counter);
+                           THError("Invalid index in scatterAdd");
+                         }
+                         tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] += *(src_data + i*src_stride);
+                       })
+}
+
 void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val)
 {
   long elems_per_row, i, idx;
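scatterAdd mirrors scatter but accumulates instead of overwriting, so repeated indices sum their contributions. A minimal Lua sketch (assuming the binding is exposed with scatter's signature):

    local dst = torch.zeros(5)
    local idx = torch.LongTensor({1, 1, 3})
    local src = torch.Tensor({10, 20, 30})
    dst:scatterAdd(1, idx, src)
    print(dst)  -- 30 0 30 0 0: the two writes to index 1 add up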
@@ -557,6 +586,33 @@ real THTensor_(maxall)(THTensor *tensor)
   return theMax;
 }
 
+static void THTensor_(quickselectnoidx)(real *arr, long k, long elements, long stride);
+
+real THTensor_(medianall)(THTensor *tensor)
+{
+  THArgCheck(tensor->nDimension > 0, 1, "tensor must have at least one dimension");
+
+  real theMedian;
+  ptrdiff_t numel;
+  long k;
+  THTensor *temp_;
+  real *temp__data;
+
+  numel = THTensor_(nElement)(tensor);
+  k = (numel-1) >> 1;
+
+  temp_ = THTensor_(newClone)(tensor);
+  temp__data = THTensor_(data)(temp_);
+
+  THTensor_(quickselectnoidx)(temp__data, k, numel, 1);
+
+  theMedian = temp__data[k];
+
+  THTensor_(free)(temp_);
+
+  return theMedian;
+}
+
 accreal THTensor_(sumall)(THTensor *tensor)
 {
   accreal sum = 0;
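medianall takes the lower median: it quickselects the zero-based element k = (numel-1) >> 1 of a cloned, flattened copy, so even-length inputs return the smaller of the two middle values and the input tensor is left untouched. A sketch, assuming the wrapper exposes this as a no-argument median:

    local x = torch.Tensor({5, 1, 4, 2})  -- sorted order: 1 2 4 5
    print(torch.median(x))                -- 2: k = (4-1) >> 1 = 1 (zero-based)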
@@ -1250,7 +1306,9 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor
   if(t != r_)
   {
     THTensor_(resizeAs)(r_, t);
-    THTensor_(copy)(r_, t);
+    if (beta != 0.0) {
+      THTensor_(copy)(r_, t);
+    }
   }
 
   /* r_ */
@@ -1317,6 +1375,7 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor
     m2_ = THTensor_(newContiguous)(m2);
   }
 
+#pragma omp critical(blasgemm)
   /* do the operation */
   THBlas_(gemm)(transpose_m1,
                 transpose_m2,
@@ -1419,7 +1478,9 @@ void THTensor_(addbmm)(THTensor *result, real beta, THTensor *t, real alpha, THT
 
   if (t != result) {
     THTensor_(resizeAs)(result, t);
-    THTensor_(copy)(result, t);
+    if (beta != 0.0) {
+      THTensor_(copy)(result, t);
+    }
   }
 
   THTensor *matrix1 = THTensor_(new)();
@@ -1460,7 +1521,9 @@ void THTensor_(baddbmm)(THTensor *result, real beta, THTensor *t, real alpha, TH
 
   if (t != result) {
     THTensor_(resizeAs)(result, t);
-    THTensor_(copy)(result, t);
+    if (beta != 0.0) {
+      THTensor_(copy)(result, t);
+    }
   }
 
   THTensor *matrix1 = THTensor_(new)();
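Skipping the preliminary copy when beta == 0 is more than a speedup: the destination's old contents are never read, so garbage or NaNs in t cannot leak into the result (BLAS gemm with beta = 0 treats C as write-only). A small Lua check:

    local t = torch.Tensor(2, 2):fill(0/0)       -- all NaN
    local a, b = torch.eye(2), torch.eye(2)
    local r = torch.addmm(0, t, 1, a, b)         -- r = 0*t + 1*(a*b)
    assert((r - torch.eye(2)):abs():max() == 0)  -- identity, no NaN leakage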
@@ -1950,6 +2013,17 @@ void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step)
   TH_TENSOR_APPLY(real, r_, *r__data = xmin + (i++)*step;);
 }
 
+void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+  int m = fmod(xmax - xmin, step) == 0;
+#else
+  int m = (xmax - xmin) % step == 0;
+#endif
+  if (m)
+    xmax -= step;
+  THTensor_(range)(r_, xmin, xmax, step);
+}
+
 void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n)
 {
   real *r__data;
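arange wraps range with half-open semantics: when xmax - xmin is an exact multiple of step it drops the endpoint, matching numpy.arange; otherwise it behaves exactly like range. A sketch, assuming the wrapper exposes it as torch.arange:

    print(torch.range(0, 3))     -- 0 1 2 3 (endpoint included)
    print(torch.arange(0, 3))    -- 0 1 2   (endpoint dropped)
    print(torch.arange(0, 3.5))  -- 0 1 2 3 (3.5 is off the grid; nothing dropped)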
@@ -2003,6 +2077,9 @@ void THTensor_(reshape)(THTensor *r_, THTensor *t, THLongStorage *size)
 #define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap
 #define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap
 
+#define ARR_SWAP(III, JJJ) \
+  REAL_SWAP(ARR(III), ARR(JJJ));
+
 #define BOTH_SWAP(III, JJJ) \
   REAL_SWAP(ARR(III), ARR(JJJ)); \
   LONG_SWAP(IDX(III), IDX(JJJ))
@@ -2222,6 +2299,53 @@ void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimensio
 
 /* Implementation of the Quickselect algorithm, based on Nicolas Devillard's
 public domain implementation at http://ndevilla.free.fr/median/median/
+Adapted similarly to the above Quicksort algorithm.
+This version does not produce indices along with values. */
+static void THTensor_(quickselectnoidx)(real *arr, long k, long elements, long stride)
+{
+  long P, L, R, i, j, swap;
+  real rswap, piv;
+  L = 0;
+  R = elements-1;
+
+  do {
+    if (R <= L) /* One element only */
+      return;
+
+    if (R == L+1) {  /* Two elements only */
+      if (ARR(L) > ARR(R)) {
+        ARR_SWAP(L, R);
+      }
+      return;
+    }
+
+    /* Use median of three for pivot choice */
+    P=(L+R)>>1;
+    ARR_SWAP(P, L+1);
+    if (ARR(L+1) > ARR(R)) { ARR_SWAP(L+1, R); }
+    if (ARR(L) > ARR(R)) { ARR_SWAP(L, R); }
+    if (ARR(L+1) > ARR(L)) { ARR_SWAP(L+1, L); }
+
+    i = L+1;
+    j = R;
+    piv = ARR(L);
+    do {
+      do i++; while(ARR(i) < piv);
+      do j--; while(ARR(j) > piv);
+      if (j < i)
+        break;
+      ARR_SWAP(i, j);
+    } while(1);
+    ARR_SWAP(L, j);
+
+    /* Re-set active partition */
+    if (j <= k) L=i;
+    if (j >= k) R=j-1;
+  } while(1);
+}
+
+/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's
+public domain implementation at http://ndevilla.free.fr/median/median/
 Adapted similarly to the above Quicksort algorithm. */
 static void THTensor_(quickselect)(real *arr, long *idx, long k, long elements, long stride)
 {
@@ -2801,7 +2925,7 @@ void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim)
   THTensor_(div)(r_, r_, t->size[dimension]);
 }
 
-void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim)
+void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim)
 {
   THLongStorage *dim;
 
@@ -2824,7 +2948,7 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keep
                          sum2 += z*z;
                        }
 
-                       if(flag)
+                       if(biased)
                        {
                          sum /= t_size;
                          sum2 /= t_size;
@@ -2846,7 +2970,7 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keep
   }
 }
 
-void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim)
+void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim)
 {
   THLongStorage *dim;
 
@@ -2869,7 +2993,7 @@ void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keep
                          sum2 += z*z;
                        }
 
-                       if(flag)
+                       if(biased)
                        {
                          sum /= t_size;
                          sum2 /= t_size;
@@ -3009,18 +3133,18 @@ accreal THTensor_(meanall)(THTensor *tensor)
   return THTensor_(sumall)(tensor)/THTensor_(nElement)(tensor);
 }
 
-accreal THTensor_(varall)(THTensor *tensor)
+accreal THTensor_(varall)(THTensor *tensor, int biased)
 {
   accreal mean = THTensor_(meanall)(tensor);
   accreal sum = 0;
   TH_TENSOR_APPLY(real, tensor, sum += (*tensor_data - mean)*(*tensor_data - mean););
-  sum /= (THTensor_(nElement)(tensor)-1);
+  sum /= THTensor_(nElement)(tensor) - (biased ? 0 : 1);
   return sum;
 }
 
-accreal THTensor_(stdall)(THTensor *tensor)
+accreal THTensor_(stdall)(THTensor *tensor, int biased)
 {
-  return sqrt(THTensor_(varall)(tensor));
+  return sqrt(THTensor_(varall)(tensor, biased));
 }
 
 void THTensor_(linspace)(THTensor *r_, real a, real b, long n)
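Renaming flag to biased and threading it through varall/stdall makes the normalization explicit: biased divides by n, unbiased (the previously hard-coded behaviour) by n - 1. Worked numbers, assuming the Lua all-elements form takes the boolean directly:

    local x = torch.Tensor({1, 2, 3, 4})  -- mean 2.5, sum of squared deviations 5
    print(x:var())                        -- 5 / 3 = 1.6667 (unbiased, old behaviour)
    print(x:var(true))                    -- 5 / 4 = 1.25   (biased)
    print(x:std(true))                    -- sqrt(1.25) = 1.1180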
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index a3cf410..d0963b1 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -18,12 +18,14 @@ TH_API void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index,
 
 TH_API void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index);
 TH_API void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src);
+TH_API void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src);
 TH_API void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val);
 
 TH_API accreal THTensor_(dot)(THTensor *t, THTensor *src);
 
 TH_API real THTensor_(minall)(THTensor *t);
 TH_API real THTensor_(maxall)(THTensor *t);
+TH_API real THTensor_(medianall)(THTensor *t);
 TH_API accreal THTensor_(sumall)(THTensor *t);
 TH_API accreal THTensor_(prodall)(THTensor *t);
 
@@ -91,6 +93,7 @@ TH_API void THTensor_(zeros)(THTensor *r_, THLongStorage *size);
 TH_API void THTensor_(ones)(THTensor *r_, THLongStorage *size);
 TH_API void THTensor_(diag)(THTensor *r_, THTensor *t, int k);
 TH_API void THTensor_(eye)(THTensor *r_, long n, long m);
+TH_API void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step);
 TH_API void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step);
 TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n);
 
@@ -166,8 +169,8 @@ TH_API void THTensor_(frac)(THTensor *r_, THTensor *t);
 TH_API void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight);
 
 TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim);
-TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim);
-TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim);
+TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim);
+TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim);
 TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim);
 TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm);
 TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value);
@@ -175,8 +178,8 @@ TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real
 TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);
 
 TH_API accreal THTensor_(meanall)(THTensor *self);
-TH_API accreal THTensor_(varall)(THTensor *self);
-TH_API accreal THTensor_(stdall)(THTensor *self);
+TH_API accreal THTensor_(varall)(THTensor *self, int biased);
+TH_API accreal THTensor_(stdall)(THTensor *self, int biased);
 TH_API accreal THTensor_(normall)(THTensor *t, real value);
 
 TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, long n);
diff --git a/lib/TH/generic/THTensorRandom.c b/lib/TH/generic/THTensorRandom.c
index 514d3dd..595cfa7 100644
--- a/lib/TH/generic/THTensorRandom.c
+++ b/lib/TH/generic/THTensorRandom.c
@@ -70,6 +70,116 @@ void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean,
   TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_logNormal(_generator, mean, stdv););
 }
 
+
+void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor *q)
+{
+  long inputsize = THTensor_(nElement)(probs);
+  long i = 0;
+  THLongTensor *smaller = THLongTensor_newWithSize1d(inputsize);
+  THLongTensor *larger = THLongTensor_newWithSize1d(inputsize);
+  long small_c = 0;
+  long large_c = 0;
+  THLongTensor_resize1d(J, inputsize);
+  THTensor_(resize1d)(q, inputsize);
+  real *q_data = THTensor_(data)(q);
+  long *J_data = THLongTensor_data(J);
+      
+  for(i = 0; i < inputsize; i++)
+    {
+      THTensor_fastSet1d(J, i, 0L);
+      real val = THTensor_fastGet1d(probs, i);
+      THTensor_fastSet1d(q, i, inputsize*val);
+      
+      if (inputsize * val < 1.0)
+        {
+          THTensor_fastSet1d(smaller, small_c, i);
+          small_c += 1;
+        }
+      else
+        {
+          THTensor_fastSet1d(larger, large_c, i);
+          large_c += 1;
+        }
+    }
+
+  // Loop through and create little binary mixtures that
+  // appropriately allocate the larger outcomes over the
+  // overall uniform mixture.
+  long large, small;
+  while(small_c > 0 && large_c > 0)
+    {
+      large = THTensor_fastGet1d(larger, large_c-1);
+      small = THTensor_fastGet1d(smaller, small_c-1);
+      
+      THTensor_fastSet1d(J, small, large);
+      q_data[large * q->stride[0]] -= 1.0 - THTensor_fastGet1d(q, small);
+
+      if(q_data[large * q->stride[0]] < 1.0)
+        {
+          THTensor_fastSet1d(smaller, small_c-1, large);
+          large_c -= 1;
+        }
+      else
+        {
+          THTensor_fastSet1d(larger, large_c-1, large);
+          small_c -= 1;
+        }
+    }
+
+  real q_min = THTensor_fastGet1d(q, inputsize-1);
+  real q_max = q_min;
+  real q_temp;
+  for(i=0; i < inputsize; i++)
+    {
+      q_temp = THTensor_fastGet1d(q, i);
+      if(q_temp < q_min)
+        q_min = q_temp;
+      else if(q_temp > q_max)
+        q_max = q_temp;
+    }
+  THArgCheckWithCleanup((q_min > 0),
+                        THCleanup(THLongTensor_free(smaller); THLongTensor_free(larger);), 2,
+                        "q_min is less than or equal to 0");
+  
+  if(q_max > 1)
+    {
+      for(i=0; i < inputsize; i++)
+        {
+          q_data[i*q->stride[0]] /= q_max;
+        }
+    }
+  for(i=0; i<inputsize; i++)
+    {
+      // sometimes a large index isn't added to J.
+      // fix it by making the probability 1 so that J isn't indexed.
+      if(J_data[i] <= 0)
+        q_data[i * q->stride[0]] = 1.0;
+    }
+  THLongTensor_free(smaller);
+  THLongTensor_free(larger);
+}
+void THTensor_(multinomialAliasDraw)(THLongTensor *self, THGenerator *_generator, THLongTensor *J, THTensor *q)
+{
+  long K = THLongTensor_nElement(J);
+  long output_nelem = THLongTensor_nElement(self);
+  
+  long i = 0; int _mask = 0;
+  real _q;
+  long rand_ind, sample_idx, J_sample, kk_sample;
+  for(i=0; i< output_nelem; i++)
+    {
+      rand_ind = (long)THRandom_uniform(_generator, 0, K);
+      _q = THTensor_fastGet1d(q, rand_ind);
+
+      _mask = THRandom_bernoulli(_generator, _q);
+      
+      J_sample = THTensor_fastGet1d(J, rand_ind);
+
+      sample_idx = J_sample * (1 - _mask) + (rand_ind + 1L) * _mask;
+
+      THTensor_fastSet1d(self, i, sample_idx-1L);
+    }
+}
 void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement)
 {
   int start_dim = THTensor_(nDimension)(prob_dist);
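This is Walker/Vose alias sampling: setup spreads the K outcome probabilities across K equal-weight slots, each holding at most two outcomes (q[i] is the probability of keeping slot i, J[i] the alias taken otherwise), so setup is O(K) and each draw is O(1): one uniform slot pick plus one Bernoulli. Usage matches the tests below:

    local probs = torch.Tensor({0.1, 0.6, 0.3})
    local state = torch.multinomialAliasSetup(probs)  -- {J, q}, reusable across draws
    local out = torch.LongTensor(100000)
    torch.multinomialAlias(out, state)                -- fills out with indices in 1..3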
diff --git a/lib/TH/generic/THTensorRandom.h b/lib/TH/generic/THTensorRandom.h
index d205142..e39d589 100644
--- a/lib/TH/generic/THTensorRandom.h
+++ b/lib/TH/generic/THTensorRandom.h
@@ -15,6 +15,8 @@ TH_API void THTensor_(exponential)(THTensor *self, THGenerator *_generator, doub
 TH_API void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma);
 TH_API void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv);
 TH_API void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement);
+TH_API void THTensor_(multinomialAliasSetup)(THTensor *prob_dist, THLongTensor *J, THTensor *q);
+TH_API void THTensor_(multinomialAliasDraw)(THLongTensor *self, THGenerator *_generator, THLongTensor *J, THTensor *q);
 #endif
 
 #if defined(TH_REAL_IS_BYTE)
diff --git a/lib/luaT/CMakeLists.txt b/lib/luaT/CMakeLists.txt
index f33768c..072991c 100644
--- a/lib/luaT/CMakeLists.txt
+++ b/lib/luaT/CMakeLists.txt
@@ -9,9 +9,14 @@ IF(LUALIB)
 ENDIF()
 
 ADD_LIBRARY(luaT SHARED luaT.h luaT.c)
-if(BUILD_STATIC)
+
+IF (BUILD_STATIC OR "$ENV{STATIC_TH}" STREQUAL "YES")
   ADD_LIBRARY(luaT_static STATIC luaT.h luaT.c)
-endif()
+  SET_TARGET_PROPERTIES(luaT_static PROPERTIES
+    COMPILE_FLAGS "-fPIC")
+  SET_TARGET_PROPERTIES(luaT_static PROPERTIES
+    PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "luaT")
+ENDIF()
 
 SET_TARGET_PROPERTIES(luaT PROPERTIES
   VERSION   0
@@ -41,5 +46,5 @@ GET_FILENAME_COMPONENT(LUAT_OUTPUT_NAME ${LUAT_OUTPUT_NAME} NAME)
 SET(LUAT_LIBRARIES "${Torch_INSTALL_LIB}/${LUAT_OUTPUT_NAME}")
 SET(LUAT_INCLUDE_DIR "${Torch_INSTALL_INCLUDE}")
 CONFIGURE_FILE(luaTConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake")
-INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake" 
+INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake"
   DESTINATION "${Torch_INSTALL_CMAKE_SUBDIR}")
diff --git a/test/test.lua b/test/test.lua
index 2abf016..7b83b9d 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -1811,6 +1811,29 @@ function torchtest.multinomialwithoutreplacement()
       end
    end
 end
+function torchtest.aliasMultinomial()
+   for i =1,5 do
+      local n_class = 5
+      local t=os.time()
+      torch.manualSeed(t)
+      local probs = torch.Tensor(n_class):uniform(0,1)
+      probs:div(probs:sum())
+      local output = torch.LongTensor(1000, 10000)
+      local n_samples = output:nElement()
+      local prob_state = torch.multinomialAliasSetup(probs)
+      mytester:assert(prob_state[1]:min() > 0, "Index = "..prob_state[1]:min()..": alias indices have an index below or equal to 0")
+      mytester:assert(prob_state[1]:max() <= n_class, prob_state[1]:max().." alias indices have an index exceeding num_class")
+      local prob_state = torch.multinomialAliasSetup(probs, prob_state)
+      mytester:assert(prob_state[1]:min() > 0, "Index = "..prob_state[1]:min()..": alias indices have an index below or equal to 0 (reused state)")
+      mytester:assert(prob_state[1]:max() <= n_class, prob_state[1]:max().." alias indices have an index exceeding num_class (reused state)")
+      local output = torch.LongTensor(n_samples)
+      output = torch.multinomialAlias(output, prob_state)
+      mytester:assert(output:nElement() == n_samples, "wrong number of samples")
+      mytester:assert(output:min() > 0, "sampled indices have an index below or equal to 0")
+      mytester:assert(output:max() <= n_class, "sampled indices have an index exceeding num_class")
+   end
+
+end
 function torchtest.multinomialvector()
    local n_col = 4
    local t=os.time()
diff --git a/test/test_aliasMultinomial.lua b/test/test_aliasMultinomial.lua
new file mode 100644
index 0000000..d935e81
--- /dev/null
+++ b/test/test_aliasMultinomial.lua
@@ -0,0 +1,40 @@
+local tester = torch.Tester()
+
+
+local function aliasMultinomial()
+   local n_class = 10000
+   local probs = torch.Tensor(n_class):uniform(0,1)
+   probs:div(probs:sum())
+   local a = torch.Timer()
+   local state = torch.multinomialAliasSetup(probs)
+   print("AliasMultinomial setup in "..a:time().real.." seconds(hot)")
+   a:reset()
+   state = torch.multinomialAliasSetup(probs, state)
+   print("AliasMultinomial setup in "..a:time().real.." seconds(cold)")
+   a:reset()
+   
+   tester:assert(state[1]:min() >= 0, "Index = "..state[1]:min()..": alias indices have an index below 0")
+   tester:assert(state[1]:max() <= n_class, state[1]:max().." alias indices have an index exceeding num_class")
+   local output = torch.LongTensor(1000000)
+   torch.multinomialAlias(output, state)
+   local n_samples = output:nElement()
+   print("AliasMultinomial draw "..n_samples.." elements from "..n_class.." classes ".."in "..a:time().real.." seconds")
+   local counts = torch.Tensor(n_class):zero(); a:reset()
+   local mult_output = torch.multinomial(probs, n_samples, true)
+   print("Multinomial draw "..n_samples.." elements from "..n_class.." classes in "..a:time().real.." seconds")
+   tester:assert(output:min() > 0, "sampled indices have an index below or equal to 0")
+   tester:assert(output:max() <= n_class, "sampled indices have an index exceeding num_class")
+   output:apply(function(x)
+         counts[x] = counts[x] + 1
+   end)
+   a:reset()
+   
+   counts:div(counts:sum())
+   
+   tester:assert(state[1]:min() >= 0, "Index = "..state[1]:min()..": alias indices have an index below 0")
+   tester:assert(state[1]:max() <= n_class, state[1]:max().." alias indices have an index exceeding num_class")
+   tester:eq(probs, counts, 0.001, "probs and counts should be approximately equal")
+end
+
+tester:add(aliasMultinomial)
+tester:run()

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git


